from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
!pip install catboost shap
Collecting catboost
Downloading https://files.pythonhosted.org/packages/20/37/bc4e0ddc30c07a96482abf1de7ed1ca54e59bba2026a33bca6d2ef286e5b/catboost-0.24.4-cp36-none-manylinux1_x86_64.whl (65.7MB)
|████████████████████████████████| 65.8MB 46kB/s
Collecting shap
Downloading https://files.pythonhosted.org/packages/85/a3/c0eab9dd6a894165e2cb87504ff5b2710ac5ede3447d9138620b7341b6a2/shap-0.37.0.tar.gz (326kB)
|████████████████████████████████| 327kB 53.9MB/s
Requirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (from catboost) (3.2.2)
Requirement already satisfied: plotly in /usr/local/lib/python3.6/dist-packages (from catboost) (4.4.1)
Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from catboost) (1.4.1)
Requirement already satisfied: pandas>=0.24.0 in /usr/local/lib/python3.6/dist-packages (from catboost) (1.1.5)
Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from catboost) (1.15.0)
Requirement already satisfied: graphviz in /usr/local/lib/python3.6/dist-packages (from catboost) (0.10.1)
Requirement already satisfied: numpy>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from catboost) (1.19.4)
Requirement already satisfied: scikit-learn in /usr/local/lib/python3.6/dist-packages (from shap) (0.22.2.post1)
Requirement already satisfied: tqdm>4.25.0 in /usr/local/lib/python3.6/dist-packages (from shap) (4.41.1)
Collecting slicer==0.0.3
Downloading https://files.pythonhosted.org/packages/02/a6/c708c5a0f338e99cfbcb6288b88794525548e4fc1b8457feec2c552a81a4/slicer-0.0.3-py3-none-any.whl
Requirement already satisfied: numba in /usr/local/lib/python3.6/dist-packages (from shap) (0.48.0)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib->catboost) (0.10.0)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->catboost) (1.3.1)
Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->catboost) (2.8.1)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->catboost) (2.4.7)
Requirement already satisfied: retrying>=1.3.3 in /usr/local/lib/python3.6/dist-packages (from plotly->catboost) (1.3.3)
Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.24.0->catboost) (2018.9)
Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-learn->shap) (1.0.0)
Requirement already satisfied: llvmlite<0.32.0,>=0.31.0dev0 in /usr/local/lib/python3.6/dist-packages (from numba->shap) (0.31.0)
Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from numba->shap) (51.1.1)
Building wheels for collected packages: shap
Building wheel for shap (setup.py) ... done
Created wheel for shap: filename=shap-0.37.0-cp36-cp36m-linux_x86_64.whl size=463917 sha256=58eaf974d0b0a0e58d2eec27a92061c1edc9eba802434c5a94f36b9cf2b9fcba
Stored in directory: /root/.cache/pip/wheels/df/ad/b0/aa7815ec68850d66551ef618095eccb962c8f6022f1d3dd989
Successfully built shap
Installing collected packages: catboost, slicer, shap
Successfully installed catboost-0.24.4 shap-0.37.0 slicer-0.0.3
from catboost import Pool, cv, CatBoostClassifier, CatBoostRegressor,CatBoost
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import gc
import matplotlib.pyplot as plt
from sklearn import metrics
import shap
import gc
from copy import deepcopy
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, OrdinalEncoder
import pandas as pd
from sklearn.model_selection import train_test_split, GroupShuffleSplit, GridSearchCV
import numpy as np
from tensorflow.keras.metrics import TopKCategoricalAccuracy, Precision, SparseTopKCategoricalAccuracy # @4
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.linear_model import LogisticRegressionCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
%matplotlib inline
pd.set_option("display.max_columns", 90)
!nvidia-smi -L
GPU 0: Tesla P4 (UUID: GPU-60de6523-4efa-0ab1-8c38-1603143eeea7)
# ## https://www.tensorflow.org/guide/mixed_precision ## TF mixed precision - pytorch requires other setup
# from tensorflow.keras.mixed_precision import experimental as mixed_precision
# policy = mixed_precision.Policy('mixed_float16')
# mixed_precision.set_policy(policy)
# ## will need to correct in places, e.g.:
# ## outputs = layers.Activation('softmax', dtype='float32', name='predictions')(x)
Aggregate feats:
We should create a dictionary of the rank, count, city/country etc. feats, so we can easily merge them when making more "negative" samples/feats for ranking.
Leaky or potentially leaky (depends on the test set):
# MIN_TARGET_FREQ = 40 # drop target/city_id values that appear less than this many times, as final step's target
# KEEP_TOP_K_TARGETS = 2000 # keep K most frequent city ID targets (redundant with the above)
## (some) categorical variables that appear less than this many times will be replaced with a placeholder value!
## Includes CITY id (but done after target filtering, to avoid creating a "rare class" target):
# LOW_COUNT_THRESH = 8
TARGET_COL = "label"  # name of the binary target column in the candidate-list dataframe
RUN_TABNET = False  # whether to also run the TabNet model (not used in this chunk)
max_epochs = 800  # max training epochs
FASTRUN = True  # when True: load only a row sample and truncate the candidate list, for quick iteration
# most basic categorical columns, without 'user_id', 'utrip_id' or device_class - used for count encoding/filtering
BASE_CAT_COLS = ['city_id', 'affiliate_id', 'booker_country', 'hotel_country']
### features to get lags for. Not very robust. May want different feats for lags before -1
LAG_FEAT_COLS = ['city_id', 'device_class',
'affiliate_id', 'booker_country', 'hotel_country',
'duration', 'same_country', 'checkin_weekday',
'checkin_week',
'checkout_weekday',
'city_id_count', 'affiliate_id_count',
# 'booker_country_count',
'hotel_country_count',
'checkin_week_count',
'city_id_rank_by_hotel_country',
'city_id_rank_by_booker_country',
'city_id_rank_by_affiliate',
'affiliate_id_rank_by_hotel_country',
# 'affiliate_id_rank_by_booker_country',
# 'booker_country_rank_by_hotel_country',
# 'booker_country_rank_by_booker_country',
# 'booker_country_rank_by_affiliate',
# 'hotel_country_rank_by_hotel_country',
'hotel_country_rank_by_booker_country',
'hotel_country_rank_by_affiliate',
# 'checkin_month_rank_by_hotel_country',
# 'checkin_month_rank_by_booker_country',
# 'checkin_month_rank_by_affiliate'
]
## Columns the downstream models should treat as categorical (includes the lag/"first" copies created below).
CAT_FEAT_NAMES = ["booker_country", "device_class","affiliate_id",
"city_id","hotel_country",
"utrip_id",
# "user_id", ## ? could use lower dim - depends on train/test overlap
"checkin_week",#"checkout_week",
# "checkin_weekday",
"lag1_city_id","lag1_booker_country","lag1_hotel_country","lag1_affiliate_id", "lag1_device_class",
"lag2_city_id","lag2_booker_country","lag2_hotel_country","lag2_affiliate_id","lag2_device_class",
# "lag3_city_id","lag3_booker_country","lag3_hotel_country","lag3_affiliate_id","lag3_device_class",
"first_hotel_country","first_city_id"
]
# https://stackoverflow.com/questions/33907537/groupby-and-lag-all-columns-of-a-dataframe
# https://stackoverflow.com/questions/62924987/lag-multiple-variables-grouped-by-columns
## lag features with groupby over many columns:
def groupbyLagFeatures(df: pd.DataFrame, lag=None, group="utrip_id", lag_feature_cols=None):
    """Add lag (shifted) feature columns computed within each group.

    New columns are named ``lag<k>_<col>`` for every shift ``k`` in `lag`.
    Assumes `df` is already sorted in the desired time order within each group.
    https://stackoverflow.com/questions/62924987/lag-multiple-variables-grouped-by-columns

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; returned with the lag columns concatenated on axis=1.
    lag : list of int, optional
        Shift amounts to compute. Defaults to [1, 2]. (``None`` default
        avoids the mutable-default-argument pitfall of the original.)
    group : str
        Column to group by before shifting (default "utrip_id").
    lag_feature_cols : list of str, optional
        Columns to lag; when None/empty, all columns are lagged.

    Returns
    -------
    pd.DataFrame
        `df` with the additional lag columns.
    """
    lag = [1, 2] if lag is None else lag
    lag_feature_cols = [] if lag_feature_cols is None else lag_feature_cols
    if len(lag_feature_cols) > 0:
        lagged = [df.groupby(group)[lag_feature_cols].shift(x).add_prefix('lag' + str(x) + "_")
                  for x in lag]
    else:
        lagged = [df.groupby(group).shift(x).add_prefix('lag' + str(x) + "_")
                  for x in lag]
    return pd.concat([df] + lagged, axis=1)
def groupbyFirstLagFeatures(df: pd.DataFrame, group="user_id", lag_feature_cols=None):
    """Broadcast each group's first-row values as new ``first_<col>`` columns.

    Assumes `df` is already sorted so the first row per group is the earliest.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; returned with the "first_" columns concatenated on axis=1.
    group : str
        Column to group by (default "user_id").
    lag_feature_cols : list of str, optional
        Columns to take the first value of; when None/empty, all columns are
        used. (``None`` default avoids the mutable-default-argument pitfall.)

    Returns
    -------
    pd.DataFrame
        `df` with the additional ``first_`` columns.
    """
    lag_feature_cols = [] if lag_feature_cols is None else lag_feature_cols
    if len(lag_feature_cols) > 0:
        firsts = df.groupby(group)[lag_feature_cols].transform("first").add_prefix("first_")
    else:
        # transform broadcasts the per-group first row back to every row
        firsts = df.groupby(group).transform("first").add_prefix("first_")
    return pd.concat([df, firsts], axis=1)
######## Get n most popular items, per group
def most_popular(group, n_max=4):
    """Return the n_max most relevant hotel clusters of a group as a space-separated string.

    Rows are ordered by descending 'relevance' and the top n_max 'hotel_cluster'
    values are rendered via numpy's array formatting (brackets stripped).
    Previous version used the nlargest() Series method to get indices of the
    largest elements, but that method is rather slow.
    Source: https://www.kaggle.com/dvasyukova/predict-hotel-type-with-pandas
    """
    order_desc = np.argsort(group['relevance'].values)[::-1]
    top_clusters = group['hotel_cluster'].values[order_desc][:n_max]
    # np.array_str renders e.g. "[20 30]"; slice off the square brackets
    return np.array_str(top_clusters)[1:-1]
from tensorflow.keras.metrics import top_k_categorical_accuracy
def top_4_accuracy(y_true, y_pred):
    """Keras metric: fraction of samples whose true class is in the top-4 predictions.

    Only meaningful for multiclass (probability-vector) predictions.
    """
    top_k = 4
    return top_k_categorical_accuracy(y_true, y_pred, k=top_k)
## https://codereview.stackexchange.com/questions/149306/select-the-n-most-frequent-items-from-a-pandas-groupby-dataframe
# https://stackoverflow.com/questions/52073054/group-by-a-column-to-find-the-most-frequent-value-in-another-column
## can get modes (sorted)
# https://stackoverflow.com/questions/50592762/finding-most-common-values-with-pandas-groupby-and-value-counts
## df.groupby('tag')['category'].agg(lambda x: x.value_counts().index[0])
# https://stackoverflow.com/questions/15222754/groupby-pandas-dataframe-and-select-most-common-value
# source2.groupby(['Country','City'])['Short name'].agg(pd.Series.mode)
## Explicit read_csv dtypes: shrink the integer columns to save memory on load.
dtypes_dict = dict(
    # utrip_id="category",
    city_id="int32",
    rank="int16",
    label="int8",
    affiliate_id="int32",
    # device_class="category",
    # booker_country="string",  ## category would save more memory but interferes with feature engineering
    # hotel_country="string"
)
## Paths on the mounted Google Drive.
TRAIN_FILE_PATH = "/content/drive/MyDrive/booking_wisdom/booking_train_set.csv" #"booking_train_set.csv"
LIST_FILE_PATH = "/content/drive/MyDrive/booking_wisdom/list_booking_train.csv.gz" #"list_booking_train.csv.gz"
## Load the raw trip rows; FASTRUN caps the row count for quick iteration.
if FASTRUN:
    df = pd.read_csv(TRAIN_FILE_PATH,
                     nrows=200_000,
                     index_col=[0],
                     parse_dates=["checkin","checkout"],infer_datetime_format=True, dtype=dtypes_dict)
else:
    df = pd.read_csv(TRAIN_FILE_PATH,
                     index_col=[0],
                     parse_dates=["checkin","checkout"],infer_datetime_format=True, dtype=dtypes_dict)
## list of candidate predictions (100 per utrip_id)
df_list = pd.read_csv(LIST_FILE_PATH,
                      usecols=[0,2,3], # skip city_id col
                      dtype=dtypes_dict)
print("rank of labels (target =1):")
print(df_list.loc[df_list["label"]==1]["rank"].describe())
df_list.set_index("utrip_id",inplace=True)
# # ## HACK due to running out of memory!
# df_list = df_list.loc[df_list["rank"]<=33]
## In FASTRUN keep only top-40-ranked candidates per trip, plus the true-label rows.
if FASTRUN:
    df_list = df_list.loc[(df_list["rank"]<=40) | (df_list["label"]==1)]
## Sort chronologically within each user - the lag/"first" features below assume this order.
df.sort_values(["user_id","checkin"],inplace=True)
# df_list.set_index("utrip_id",inplace=True)
display(df)
display(df_list)
rank of labels (target =1): count 191671.000000 mean 11.927016 std 18.841272 min 0.000000 25% 1.000000 50% 4.000000 75% 14.000000 max 99.000000 Name: rank, dtype: float64
| user_id | checkin | checkout | city_id | device_class | affiliate_id | booker_country | hotel_country | utrip_id | |
|---|---|---|---|---|---|---|---|---|---|
| 117277 | 136 | 2016-09-20 | 2016-09-22 | 52933 | desktop | 9924 | The Devilfire Empire | Osterlich | 136_4 |
| 117278 | 136 | 2016-09-22 | 2016-09-23 | 51685 | desktop | 9924 | The Devilfire Empire | Osterlich | 136_4 |
| 117279 | 136 | 2016-09-23 | 2016-09-24 | 43323 | desktop | 9924 | The Devilfire Empire | Osterlich | 136_4 |
| 117280 | 136 | 2016-09-24 | 2016-09-26 | 55990 | desktop | 9924 | The Devilfire Empire | Osterlich | 136_4 |
| 117281 | 136 | 2016-09-26 | 2016-09-27 | 46411 | desktop | 9924 | The Devilfire Empire | Osterlich | 136_4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 64260 | 6257973 | 2016-08-12 | 2016-08-15 | 15470 | tablet | 5755 | Gondal | Rolisica | 6257973_1 |
| 180788 | 6258041 | 2016-04-28 | 2016-04-29 | 57109 | mobile | 9452 | Elbonia | Glubbdubdrib | 6258041_1 |
| 180789 | 6258041 | 2016-04-29 | 2016-04-30 | 57109 | mobile | 9452 | Elbonia | Glubbdubdrib | 6258041_1 |
| 180790 | 6258041 | 2016-04-30 | 2016-05-01 | 7529 | mobile | 9452 | Elbonia | Glubbdubdrib | 6258041_1 |
| 180791 | 6258041 | 2016-05-01 | 2016-05-02 | 17338 | mobile | 9452 | Elbonia | Glubbdubdrib | 6258041_1 |
200000 rows × 9 columns
| rank | label | |
|---|---|---|
| utrip_id | ||
| 727105_1 | 0 | 0 |
| 727105_1 | 1 | 0 |
| 727105_1 | 2 | 0 |
| 727105_1 | 3 | 0 |
| 727105_1 | 4 | 0 |
| ... | ... | ... |
| 2730109_1 | 36 | 0 |
| 2730109_1 | 37 | 0 |
| 2730109_1 | 38 | 0 |
| 2730109_1 | 39 | 0 |
| 2730109_1 | 40 | 0 |
8350752 rows × 2 columns
## Basic datetime-derived features.
df["duration"] = ((df["checkout"] - df["checkin"]).dt.days).astype(int)  # stay length in nights
df["same_country"] = (df["booker_country"]==df["hotel_country"]).astype(int)  # domestic-trip flag
df["checkin_day"] = df["checkin"].dt.day
df["checkin_weekday"] = df["checkin"].dt.weekday
df["checkin_week"] = df["checkin"].dt.isocalendar().week.astype(int) ## week of year
df["checkin_month"] = df["checkin"].dt.month
df["checkin_year"] = df["checkin"].dt.year-2015  # small integer offset instead of raw year
df["checkin_day_of_year"] = df["checkin"].dt.dayofyear # day of year - may overfit , but will allow learning holidays and other checkins on that date
df["checkin_quarter"] = df["checkin"].dt.quarter # relatively redundant but may be used for "id"
df["checkout_weekday"] = df["checkout"].dt.weekday
df["checkout_week"] = df["checkout"].dt.isocalendar().week.astype(int) ## week of year
df["checkout_day"] = df["checkout"].dt.day ## day of month
## cyclical datetime embeddings: sin/cos encode month/week so e.g. December is close to January
## drop original variables?
## TODO: add for other variables, +- those that we'll embed (week?)
# df['checkin_weekday_sin'] = np.sin(df["checkin_weekday"]*(2.*np.pi/7))
# df['checkin_weekday_cos'] = np.cos(df["checkin_weekday"]*(2.*np.pi/7))
df['checkin_month_sin'] = np.sin((df["checkin_month"]-1)*(2.*np.pi/12))
df['checkin_month_cos'] = np.cos((df["checkin_month"]-1)*(2.*np.pi/12))
df['checkin_week_sin'] = np.sin((df["checkin_week"]-1)*(2.*np.pi/53))
df['checkin_week_cos'] = np.cos((df["checkin_week"]-1)*(2.*np.pi/53))
# #############
# # last number in utrip id - probably which trip number it is:
# df["utrip_number"] = df["utrip_id"].str.split("_",expand=True)[1].astype(int)
### encode string columns - must be consistent with test data
### IF we can concat test with train, we can just do a single transformation for the NON TARGET cols
# obj_cols_list = df.select_dtypes("O").columns.values
obj_cols_list = ['device_class','booker_country','hotel_country'] # we could also define when loading data, dtype
# for c in obj_cols_list:
#     df[c] = df[c].astype("category")
#     df[c] = df[c].cat.codes.astype(int)
## view steps of a trip per user & trip, in order. ## last step == 1.
## count #/pct step in a trip (utrip_id) per user. Useful to get the "final" step per trip - for prediction
## note that the order is ascending, so we would need to select by "last" (i.e. "1" is the first step, 2 the second, etc.), or we could use pct .rank(ascending=True,pct=True)
#### this feature overlaps with the count of each trip id (for the final row)
## = df.sort_values(["checkin","checkout"])... - df already sorted above
# df["utrip_steps_from_end"] = df.groupby("utrip_id")["checkin"].rank(ascending=True,pct=True) #.cumcount("user_id")
### add features to be consistent with test set of row in trip, and total trips in trip
df["row_num"] = df.groupby("utrip_id")["checkin"].rank(ascending=True,pct=False).astype(int)  # 1-based step index within the trip
utrip_counts = df["utrip_id"].value_counts()
df["total_rows"] = df["utrip_id"].map(utrip_counts)  # trip length (number of steps in the trip)
df[["row_num","total_rows"]].describe()
| row_num | total_rows | |
|---|---|---|
| count | 200000.000000 | 200000.000000 |
| mean | 3.551565 | 6.103130 |
| std | 2.372298 | 2.796659 |
| min | 1.000000 | 1.000000 |
| 25% | 2.000000 | 4.000000 |
| 50% | 3.000000 | 5.000000 |
| 75% | 5.000000 | 7.000000 |
| max | 48.000000 | 48.000000 |
## A row is the trip's final step when its step index equals the trip length.
df["last"] = (df["row_num"] ==df["total_rows"]).astype(int)
## add the "first" place visited/values
### note - will need to drop first row in trip, or impute NaNs when using this feature
### first by user results in too much sparsity/rareness for our IDs purposes
df = groupbyFirstLagFeatures(df,group="utrip_id",lag_feature_cols=["hotel_country","city_id","duration","same_country"]) # ["hotel_country","city_id"]
# df = df.loc[df["row_num"]>1] ## can't do yet, needed for lag features
print(df[["first_hotel_country","hotel_country","city_id"]].nunique())
df
first_hotel_country 141 hotel_country 162 city_id 20148 dtype: int64
| user_id | checkin | checkout | city_id | device_class | affiliate_id | booker_country | hotel_country | utrip_id | duration | same_country | checkin_day | checkin_weekday | checkin_week | checkin_month | checkin_year | checkin_day_of_year | checkin_quarter | checkout_weekday | checkout_week | checkout_day | checkin_month_sin | checkin_month_cos | checkin_week_sin | checkin_week_cos | row_num | total_rows | last | first_hotel_country | first_city_id | first_duration | first_same_country | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 117277 | 136 | 2016-09-20 | 2016-09-22 | 52933 | desktop | 9924 | The Devilfire Empire | Osterlich | 136_4 | 2 | 0 | 20 | 1 | 38 | 9 | 1 | 264 | 3 | 3 | 38 | 22 | -0.866025 | -5.000000e-01 | -0.947326 | -0.320270 | 1 | 7 | 0 | Osterlich | 52933 | 2 | 0 |
| 117278 | 136 | 2016-09-22 | 2016-09-23 | 51685 | desktop | 9924 | The Devilfire Empire | Osterlich | 136_4 | 1 | 0 | 22 | 3 | 38 | 9 | 1 | 266 | 3 | 4 | 38 | 23 | -0.866025 | -5.000000e-01 | -0.947326 | -0.320270 | 2 | 7 | 0 | Osterlich | 52933 | 2 | 0 |
| 117279 | 136 | 2016-09-23 | 2016-09-24 | 43323 | desktop | 9924 | The Devilfire Empire | Osterlich | 136_4 | 1 | 0 | 23 | 4 | 38 | 9 | 1 | 267 | 3 | 5 | 38 | 24 | -0.866025 | -5.000000e-01 | -0.947326 | -0.320270 | 3 | 7 | 0 | Osterlich | 52933 | 2 | 0 |
| 117280 | 136 | 2016-09-24 | 2016-09-26 | 55990 | desktop | 9924 | The Devilfire Empire | Osterlich | 136_4 | 2 | 0 | 24 | 5 | 38 | 9 | 1 | 268 | 3 | 0 | 39 | 26 | -0.866025 | -5.000000e-01 | -0.947326 | -0.320270 | 4 | 7 | 0 | Osterlich | 52933 | 2 | 0 |
| 117281 | 136 | 2016-09-26 | 2016-09-27 | 46411 | desktop | 9924 | The Devilfire Empire | Osterlich | 136_4 | 1 | 0 | 26 | 0 | 39 | 9 | 1 | 270 | 3 | 1 | 39 | 27 | -0.866025 | -5.000000e-01 | -0.978556 | -0.205979 | 5 | 7 | 0 | Osterlich | 52933 | 2 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 64260 | 6257973 | 2016-08-12 | 2016-08-15 | 15470 | tablet | 5755 | Gondal | Rolisica | 6257973_1 | 3 | 0 | 12 | 4 | 32 | 8 | 1 | 225 | 3 | 0 | 33 | 15 | -0.500000 | -8.660254e-01 | -0.508531 | -0.861044 | 4 | 4 | 1 | Rolisica | 53238 | 1 | 0 |
| 180788 | 6258041 | 2016-04-28 | 2016-04-29 | 57109 | mobile | 9452 | Elbonia | Glubbdubdrib | 6258041_1 | 1 | 0 | 28 | 3 | 17 | 4 | 1 | 119 | 2 | 4 | 17 | 29 | 1.000000 | 6.123234e-17 | 0.947326 | -0.320270 | 1 | 4 | 0 | Glubbdubdrib | 57109 | 1 | 0 |
| 180789 | 6258041 | 2016-04-29 | 2016-04-30 | 57109 | mobile | 9452 | Elbonia | Glubbdubdrib | 6258041_1 | 1 | 0 | 29 | 4 | 17 | 4 | 1 | 120 | 2 | 5 | 17 | 30 | 1.000000 | 6.123234e-17 | 0.947326 | -0.320270 | 2 | 4 | 0 | Glubbdubdrib | 57109 | 1 | 0 |
| 180790 | 6258041 | 2016-04-30 | 2016-05-01 | 7529 | mobile | 9452 | Elbonia | Glubbdubdrib | 6258041_1 | 1 | 0 | 30 | 5 | 17 | 4 | 1 | 121 | 2 | 6 | 17 | 1 | 1.000000 | 6.123234e-17 | 0.947326 | -0.320270 | 3 | 4 | 0 | Glubbdubdrib | 57109 | 1 | 0 |
| 180791 | 6258041 | 2016-05-01 | 2016-05-02 | 17338 | mobile | 9452 | Elbonia | Glubbdubdrib | 6258041_1 | 1 | 0 | 1 | 6 | 17 | 5 | 1 | 122 | 2 | 0 | 18 | 2 | 0.866025 | -5.000000e-01 | 0.947326 | -0.320270 | 4 | 4 | 1 | Glubbdubdrib | 57109 | 1 | 0 |
200000 rows × 32 columns
### replace rare affiliate_id values (fewer than 3 occurrences) with a "-1" dummy
affiliates_counts = df["affiliate_id"].value_counts()
print("before:", affiliates_counts)
print("uniques",df["affiliate_id"].nunique())
affiliates_counts = affiliates_counts.to_dict()
# df["affiliate_id"] = df["affiliate_id"].where(df["affiliate_id"].apply(lambda x: x.map(x.value_counts()))>=3, -1)
df["affiliate_id"] = df["affiliate_id"].where(df["affiliate_id"].map(affiliates_counts)>=3, -1)  # keep values seen >=3 times, else -1
df["affiliate_id"] = df["affiliate_id"].astype(int)
print("after\n",df["affiliate_id"].value_counts())
print("uniques",df["affiliate_id"].nunique())
before: 9924 47574
359 29601
384 15200
9452 14618
4541 7270
...
1858 1
5696 1
3614 1
3699 1
4094 1
Name: affiliate_id, Length: 1589, dtype: int64
uniques 1589
after
9924 47574
359 29601
384 15200
9452 14618
4541 7270
...
10185 3
7467 3
8329 3
364 3
9113 3
Name: affiliate_id, Length: 974, dtype: int64
uniques 974
### for possible "user id" embedding/ID: how many unique values are there for these source tuples?
### Could also maybe add previous location/lag1 country/city ?
## 'device_class','affiliate_id', 'booker_country' - 7.5 K "uniques"
## 'device_class','affiliate_id', 'booker_country','checkin_month' - 24 K "uniques"
## 'device_class','affiliate_id', 'booker_country','checkin_quarter' 14K "uniques"
print(df[['device_class','affiliate_id', 'booker_country','checkin_month',"total_rows"]].nunique(axis=0))
df.groupby(['device_class','affiliate_id', 'booker_country','checkin_quarter']).size()
device_class 3 affiliate_id 974 booker_country 5 checkin_month 12 total_rows 29 dtype: int64
device_class affiliate_id booker_country checkin_quarter
desktop -1 Bartovia 2 3
3 13
4 13
Elbonia 1 22
2 32
..
tablet 10582 Elbonia 2 2
3 5
10615 Elbonia 2 2
3 5
10668 Gondal 1 1
Length: 5012, dtype: int64
##### Following aggregation features - would be best to use a time window (sorted data) to generate, otherwise they will LEAK! (e.g. nunique countries visited)
### count features (can also later add rank inside groups).
### Some may be leaks (# visits in a trip should use time window?) , and do users repeat?
### can add more counts of group X time period (e.g. affiliate X month of year)
## alt way to get counts/freq :
# freq = df["city_id"].value_counts()
# df["city_id_count"] = df["city_id"].map(freq)
# print(df["city_id_count"].describe())
count_cols = [ 'city_id','affiliate_id', 'hotel_country',
# 'utrip_id','user_id',
"checkin_month",
"checkin_week"]
## Global frequency of each value; transform("size") counts rows per group,
## so "duration" here is just a carrier column - any column would do.
for c in count_cols:
    df[f"{c}_count"] = df.groupby([c])["duration"].transform("size")
########################################################
## nunique per trip
### https://stackoverflow.com/questions/46470743/how-to-efficiently-compute-a-rolling-unique-count-in-a-pandas-time-series
nunique_cols = [ 'city_id', 'hotel_country','affiliate_id', 'booker_country']
# df["nunique_booker_countries"] = df.groupby("utrip_id")["booker_country"].nunique()
# df["nunique_hotel_country"] = df.groupby("utrip_id")["hotel_country"].nunique()
for c in nunique_cols:
    df[f"{c}_nunique"] = df.groupby(["utrip_id"])[c].transform("nunique")
print(df.nunique())
########################################################
## get frequency/count feature's rank within a group - e.g. within a country (or affiliate)
## add "_count" to column name to get count col name, then add rank col
### ALT/ duplicate feat - add percent rank (instead or in addition)
rank_cols = ['city_id','affiliate_id', 'hotel_country', # 'booker_country',
"checkin_month"]
### NOTE(review): ranking a variable's own count grouped by that same variable is
### presumably constant within each group - verify these features are informative.
for c in rank_cols:
    df[f"{c}_rank_by_hotel_country"] = df.groupby(['hotel_country'])[f"{c}_count"].transform("rank")
    df[f"{c}_rank_by_booker_country"] = df.groupby(['booker_country'])[f"{c}_count"].transform("rank")
    df[f"{c}_rank_by_affiliate"] = df.groupby(['affiliate_id'])[f"{c}_count"].transform("rank")
df
user_id 36694 checkin 425 checkout 425 city_id 20148 device_class 3 affiliate_id 974 booker_country 5 hotel_country 162 utrip_id 37353 duration 28 same_country 2 checkin_day 31 checkin_weekday 7 checkin_week 53 checkin_month 12 checkin_year 3 checkin_day_of_year 366 checkin_quarter 4 checkout_weekday 7 checkout_week 53 checkout_day 31 checkin_month_sin 12 checkin_month_cos 12 checkin_week_sin 53 checkin_week_cos 45 row_num 48 total_rows 29 last 2 first_hotel_country 141 first_city_id 7727 first_duration 23 first_same_country 2 city_id_count 303 affiliate_id_count 181 hotel_country_count 114 checkin_month_count 12 checkin_week_count 52 city_id_nunique 25 hotel_country_nunique 10 affiliate_id_nunique 9 booker_country_nunique 2 dtype: int64
| user_id | checkin | checkout | city_id | device_class | affiliate_id | booker_country | hotel_country | utrip_id | duration | same_country | checkin_day | checkin_weekday | checkin_week | checkin_month | checkin_year | checkin_day_of_year | checkin_quarter | checkout_weekday | checkout_week | checkout_day | checkin_month_sin | checkin_month_cos | checkin_week_sin | checkin_week_cos | row_num | total_rows | last | first_hotel_country | first_city_id | first_duration | first_same_country | city_id_count | affiliate_id_count | hotel_country_count | checkin_month_count | checkin_week_count | city_id_nunique | hotel_country_nunique | affiliate_id_nunique | booker_country_nunique | city_id_rank_by_hotel_country | city_id_rank_by_booker_country | city_id_rank_by_affiliate | affiliate_id_rank_by_hotel_country | affiliate_id_rank_by_booker_country | affiliate_id_rank_by_affiliate | hotel_country_rank_by_hotel_country | hotel_country_rank_by_booker_country | hotel_country_rank_by_affiliate | checkin_month_rank_by_hotel_country | checkin_month_rank_by_booker_country | checkin_month_rank_by_affiliate | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 117277 | 136 | 2016-09-20 | 2016-09-22 | 52933 | desktop | 9924 | The Devilfire Empire | Osterlich | 136_4 | 2 | 0 | 20 | 1 | 38 | 9 | 1 | 264 | 3 | 3 | 38 | 22 | -0.866025 | -5.000000e-01 | -0.947326 | -0.320270 | 1 | 7 | 0 | Osterlich | 52933 | 2 | 0 | 207 | 47574 | 3203 | 24055 | 5127 | 7 | 1 | 1 | 1 | 2673.0 | 24590.5 | 31435.5 | 2733.0 | 42604.5 | 23787.5 | 1602.0 | 10345.5 | 11777.0 | 1930.0 | 34655.0 | 29146.0 |
| 117278 | 136 | 2016-09-22 | 2016-09-23 | 51685 | desktop | 9924 | The Devilfire Empire | Osterlich | 136_4 | 1 | 0 | 22 | 3 | 38 | 9 | 1 | 266 | 3 | 4 | 38 | 23 | -0.866025 | -5.000000e-01 | -0.947326 | -0.320270 | 2 | 7 | 0 | Osterlich | 52933 | 2 | 0 | 64 | 47574 | 3203 | 24055 | 5127 | 7 | 1 | 1 | 1 | 1712.5 | 12595.0 | 21033.0 | 2733.0 | 42604.5 | 23787.5 | 1602.0 | 10345.5 | 11777.0 | 1930.0 | 34655.0 | 29146.0 |
| 117279 | 136 | 2016-09-23 | 2016-09-24 | 43323 | desktop | 9924 | The Devilfire Empire | Osterlich | 136_4 | 1 | 0 | 23 | 4 | 38 | 9 | 1 | 267 | 3 | 5 | 38 | 24 | -0.866025 | -5.000000e-01 | -0.947326 | -0.320270 | 3 | 7 | 0 | Osterlich | 52933 | 2 | 0 | 10 | 47574 | 3203 | 24055 | 5127 | 7 | 1 | 1 | 1 | 592.5 | 4498.0 | 9755.5 | 2733.0 | 42604.5 | 23787.5 | 1602.0 | 10345.5 | 11777.0 | 1930.0 | 34655.0 | 29146.0 |
| 117280 | 136 | 2016-09-24 | 2016-09-26 | 55990 | desktop | 9924 | The Devilfire Empire | Osterlich | 136_4 | 2 | 0 | 24 | 5 | 38 | 9 | 1 | 268 | 3 | 0 | 39 | 26 | -0.866025 | -5.000000e-01 | -0.947326 | -0.320270 | 4 | 7 | 0 | Osterlich | 52933 | 2 | 0 | 28 | 47574 | 3203 | 24055 | 5127 | 7 | 1 | 1 | 1 | 1058.5 | 8005.0 | 15166.5 | 2733.0 | 42604.5 | 23787.5 | 1602.0 | 10345.5 | 11777.0 | 1930.0 | 34655.0 | 29146.0 |
| 117281 | 136 | 2016-09-26 | 2016-09-27 | 46411 | desktop | 9924 | The Devilfire Empire | Osterlich | 136_4 | 1 | 0 | 26 | 0 | 39 | 9 | 1 | 270 | 3 | 1 | 39 | 27 | -0.866025 | -5.000000e-01 | -0.978556 | -0.205979 | 5 | 7 | 0 | Osterlich | 52933 | 2 | 0 | 154 | 47574 | 3203 | 24055 | 6396 | 7 | 1 | 1 | 1 | 2291.5 | 20737.5 | 28377.5 | 2733.0 | 42604.5 | 23787.5 | 1602.0 | 10345.5 | 11777.0 | 1930.0 | 34655.0 | 29146.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 64260 | 6257973 | 2016-08-12 | 2016-08-15 | 15470 | tablet | 5755 | Gondal | Rolisica | 6257973_1 | 3 | 0 | 12 | 4 | 32 | 8 | 1 | 225 | 3 | 0 | 33 | 15 | -0.500000 | -8.660254e-01 | -0.508531 | -0.861044 | 4 | 4 | 1 | Rolisica | 53238 | 1 | 0 | 59 | 3085 | 3845 | 38732 | 10634 | 4 | 1 | 3 | 1 | 516.0 | 45165.0 | 1364.5 | 1063.0 | 25999.0 | 1543.0 | 1923.0 | 24934.0 | 771.0 | 3536.0 | 81141.5 | 2716.0 |
| 180788 | 6258041 | 2016-04-28 | 2016-04-29 | 57109 | mobile | 9452 | Elbonia | Glubbdubdrib | 6258041_1 | 1 | 0 | 28 | 3 | 17 | 4 | 1 | 119 | 2 | 4 | 17 | 29 | 1.000000 | 6.123234e-17 | 0.947326 | -0.320270 | 1 | 4 | 0 | Glubbdubdrib | 57109 | 1 | 0 | 13 | 14618 | 12732 | 12096 | 3131 | 3 | 1 | 1 | 1 | 3395.5 | 12946.5 | 3411.0 | 7116.0 | 20382.0 | 7309.5 | 6366.5 | 26918.5 | 10439.0 | 2934.0 | 9694.5 | 4298.5 |
| 180789 | 6258041 | 2016-04-29 | 2016-04-30 | 57109 | mobile | 9452 | Elbonia | Glubbdubdrib | 6258041_1 | 1 | 0 | 29 | 4 | 17 | 4 | 1 | 120 | 2 | 5 | 17 | 30 | 1.000000 | 6.123234e-17 | 0.947326 | -0.320270 | 2 | 4 | 0 | Glubbdubdrib | 57109 | 1 | 0 | 13 | 14618 | 12732 | 12096 | 3131 | 3 | 1 | 1 | 1 | 3395.5 | 12946.5 | 3411.0 | 7116.0 | 20382.0 | 7309.5 | 6366.5 | 26918.5 | 10439.0 | 2934.0 | 9694.5 | 4298.5 |
| 180790 | 6258041 | 2016-04-30 | 2016-05-01 | 7529 | mobile | 9452 | Elbonia | Glubbdubdrib | 6258041_1 | 1 | 0 | 30 | 5 | 17 | 4 | 1 | 121 | 2 | 6 | 17 | 1 | 1.000000 | 6.123234e-17 | 0.947326 | -0.320270 | 3 | 4 | 0 | Glubbdubdrib | 57109 | 1 | 0 | 3 | 14618 | 12732 | 12096 | 3131 | 3 | 1 | 1 | 1 | 1343.0 | 5710.0 | 1366.0 | 7116.0 | 20382.0 | 7309.5 | 6366.5 | 26918.5 | 10439.0 | 2934.0 | 9694.5 | 4298.5 |
| 180791 | 6258041 | 2016-05-01 | 2016-05-02 | 17338 | mobile | 9452 | Elbonia | Glubbdubdrib | 6258041_1 | 1 | 0 | 1 | 6 | 17 | 5 | 1 | 122 | 2 | 0 | 18 | 2 | 0.866025 | -5.000000e-01 | 0.947326 | -0.320270 | 4 | 4 | 1 | Glubbdubdrib | 57109 | 1 | 0 | 18 | 14618 | 12732 | 15729 | 3131 | 3 | 1 | 1 | 1 | 3782.5 | 14697.5 | 3972.0 | 7116.0 | 20382.0 | 7309.5 | 6366.5 | 26918.5 | 10439.0 | 4229.5 | 12531.0 | 5303.0 |
200000 rows × 53 columns
WARNING! Some features here are NOT calculated correctly - we see too many values; they are not unique per city_id & hotel!!
# CITY_FEATURES = df.columns[(df.columns.str.contains("city|hotel",case=False)) & (~df.columns.str.contains("first_|month",case=False))]
## Static per-city features, deduplicated into a lookup table keyed by city_id
## (so they can be merged onto candidate rows later).
CITY_FEATURES = ['city_id', 'hotel_country', 'city_id_count', 'hotel_country_count',
'city_id_rank_by_hotel_country']
print("city features\n",CITY_FEATURES)
df_city_features = df[CITY_FEATURES].drop_duplicates().set_index("city_id")
## Sanity check: exactly one feature row per city_id.
assert df["city_id"].nunique() == df_city_features.shape[0]
print(df_city_features.shape)
city features ['city_id', 'hotel_country', 'city_id_count', 'hotel_country_count', 'city_id_rank_by_hotel_country'] (20148, 4)
## How many distinct cities appear at least 10 times in the loaded sample.
df.loc[df["city_id_count"]>=10]["city_id"].nunique()
2650
## Sanity check: no missing values anywhere before modeling.
assert df.isna().sum().max() ==0
df[[ 'checkin', 'checkout','booker_country', 'hotel_country', 'duration']].describe(include="all",datetime_is_numeric=True)
| checkin | checkout | booker_country | hotel_country | duration | |
|---|---|---|---|---|---|
| count | 200000 | 200000 | 200000 | 200000 | 200000.000000 |
| unique | NaN | NaN | 5 | 162 | NaN |
| top | NaN | NaN | Gondal | Cobra Island | NaN |
| freq | NaN | NaN | 92328 | 23733 | NaN |
| mean | 2016-08-02 02:20:29.615977984 | 2016-08-03 20:05:58.703989248 | NaN | NaN | 1.739920 |
| min | 2015-12-31 00:00:00 | 2016-01-01 00:00:00 | NaN | NaN | 1.000000 |
| 25% | 2016-06-07 00:00:00 | 2016-06-09 00:00:00 | NaN | NaN | 1.000000 |
| 50% | 2016-08-07 00:00:00 | 2016-08-08 00:00:00 | NaN | NaN | 1.000000 |
| 75% | 2016-09-26 00:00:00 | 2016-09-28 00:00:00 | NaN | NaN | 2.000000 |
| max | 2017-02-27 00:00:00 | 2017-02-28 00:00:00 | NaN | NaN | 30.000000 |
| std | NaN | NaN | NaN | NaN | 1.210217 |
# LAG_FEAT_COLS = ['city_id', 'device_class',
# 'affiliate_id', 'booker_country', 'hotel_country',
# 'duration', 'same_country', 'checkin_day', 'checkin_weekday',
# 'checkin_week',
# 'checkout_weekday','checkout_week',
# 'city_id_count', 'affiliate_id_count',
# 'booker_country_count', 'hotel_country_count',
# 'checkin_month_count', 'checkin_week_count', 'city_id_nunique',
# 'affiliate_id_nunique', 'booker_country_nunique',
# 'hotel_country_nunique', 'city_id_rank_by_hotel_country',
# 'city_id_rank_by_booker_country', 'city_id_rank_by_affiliate',
# 'affiliate_id_rank_by_hotel_country',
# 'affiliate_id_rank_by_booker_country', 'affiliate_id_rank_by_affiliate',
# 'booker_country_rank_by_hotel_country',
# 'booker_country_rank_by_booker_country',
# 'booker_country_rank_by_affiliate',
# 'hotel_country_rank_by_hotel_country',
# 'hotel_country_rank_by_booker_country',
# 'hotel_country_rank_by_affiliate',
# 'checkin_month_rank_by_hotel_country',
# 'checkin_month_rank_by_booker_country',
# 'checkin_month_rank_by_affiliate']
Drop leak features (target values - country, city).
Drop instances that lack history (e.g. keep only the 3rd step onwards) - via dropna on the lag features.
User id / split could maybe be done by utrip ID? hotel_country as a feature?
### features to drop - not usable, or leaks (e.g. aggregations on target)
# Columns to drop before modeling: identifiers, raw dates, and leak-prone
# aggregations computed on the target variables (hotel_country / city_id).
# Fix: 'city_id_count' was previously listed twice; the list is now duplicate-free.
DROP_FEATS = [
    'user_id',
    'checkin', 'checkout',
    'hotel_country',                      # target-derived — direct leak
    'same_country',
    # 'utrip_id',
    # 'utrip_steps_from_end',
    'city_id_count', 'hotel_country_count',
    'city_id_nunique', 'hotel_country_nunique',
    'city_id_rank_by_hotel_country', 'city_id_rank_by_booker_country', 'city_id_rank_by_affiliate',
    'affiliate_id_rank_by_hotel_country', 'affiliate_id_rank_by_booker_country', 'affiliate_id_rank_by_affiliate',
    'hotel_country_rank_by_hotel_country',
    'hotel_country_rank_by_booker_country', 'hotel_country_rank_by_affiliate',
    'booker_country_rank_by_hotel_country', 'booker_country_rank_by_booker_country',
    'checkin_month_rank_by_hotel_country',
]
# df2.drop(DROP_FEATS,axis=1).columns
# Inspect the full column list before building lag features.
df.columns
Index(['user_id', 'checkin', 'checkout', 'city_id', 'device_class',
'affiliate_id', 'booker_country', 'hotel_country', 'utrip_id',
'duration', 'same_country', 'checkin_day', 'checkin_weekday',
'checkin_week', 'checkin_month', 'checkin_year', 'checkin_day_of_year',
'checkin_quarter', 'checkout_weekday', 'checkout_week', 'checkout_day',
'checkin_month_sin', 'checkin_month_cos', 'checkin_week_sin',
'checkin_week_cos', 'row_num', 'total_rows', 'last',
'first_hotel_country', 'first_city_id', 'first_duration',
'first_same_country', 'city_id_count', 'affiliate_id_count',
'hotel_country_count', 'checkin_month_count', 'checkin_week_count',
'city_id_nunique', 'hotel_country_nunique', 'affiliate_id_nunique',
'booker_country_nunique', 'city_id_rank_by_hotel_country',
'city_id_rank_by_booker_country', 'city_id_rank_by_affiliate',
'affiliate_id_rank_by_hotel_country',
'affiliate_id_rank_by_booker_country', 'affiliate_id_rank_by_affiliate',
'hotel_country_rank_by_hotel_country',
'hotel_country_rank_by_booker_country',
'hotel_country_rank_by_affiliate',
'checkin_month_rank_by_hotel_country',
'checkin_month_rank_by_booker_country',
'checkin_month_rank_by_affiliate'],
dtype='object')
print(df.shape)
# ### lag features - last n visits
# Add lag-1 and lag-2 copies of LAG_FEAT_COLS within each trip (utrip_id).
# NOTE(review): groupbyLagFeatures is defined elsewhere — assumed to shift
# lag_feature_cols within each group; confirm against its definition.
df_feat = groupbyLagFeatures(df=df.copy(),
                             lag=[1, 2], group="utrip_id", lag_feature_cols=LAG_FEAT_COLS)
# df_feat = df_feat.dropna(subset=["lag2_city_id"]).sample(frac=1)
# Keep only the final visit of each trip (the row we predict on).
df_feat = df_feat.loc[df_feat["last"] == 1]
# df_feat = df_feat.drop(DROP_FEATS,axis=1,errors="ignore")
print(df_feat.shape)
# Every trip must still be represented after the "last row" filter.
assert df_feat["utrip_id"].nunique() == df["utrip_id"].nunique()
### impute missing values - for categoricals at least - strings, numbers
OBJECT_COLS = list(df_feat.select_dtypes("O").columns)
df_feat[OBJECT_COLS] = df_feat[OBJECT_COLS].fillna("")
df_feat[CAT_FEAT_NAMES] = df_feat[CAT_FEAT_NAMES].fillna(-1)
## maybe impute numerics? (for integer type, less data usage)
## downcast dtypes: floats first, then integers, to shrink memory usage.
## Fix: this line was previously duplicated verbatim — the second pass was a
## no-op (dtypes already downcast), so a single pass suffices.
df_feat = df_feat.apply(pd.to_numeric, errors='ignore', downcast="float").apply(pd.to_numeric, errors='ignore', downcast="integer")
# df_feat.sort_values(["user_id","utrip_steps_from_end"])
print(df_feat.info())
df_feat
(200000, 53) (37353, 93) <class 'pandas.core.frame.DataFrame'> Int64Index: 37353 entries, 117283 to 180791 Data columns (total 93 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 user_id 37353 non-null int32 1 checkin 37353 non-null float32 2 checkout 37353 non-null float32 3 city_id 37353 non-null int32 4 device_class 37353 non-null object 5 affiliate_id 37353 non-null int16 6 booker_country 37353 non-null object 7 hotel_country 37353 non-null object 8 utrip_id 37353 non-null object 9 duration 37353 non-null int8 10 same_country 37353 non-null int8 11 checkin_day 37353 non-null int8 12 checkin_weekday 37353 non-null int8 13 checkin_week 37353 non-null int8 14 checkin_month 37353 non-null int8 15 checkin_year 37353 non-null int8 16 checkin_day_of_year 37353 non-null int16 17 checkin_quarter 37353 non-null int8 18 checkout_weekday 37353 non-null int8 19 checkout_week 37353 non-null int8 20 checkout_day 37353 non-null int8 21 checkin_month_sin 37353 non-null float32 22 checkin_month_cos 37353 non-null float32 23 checkin_week_sin 37353 non-null float32 24 checkin_week_cos 37353 non-null float32 25 row_num 37353 non-null int8 26 total_rows 37353 non-null int8 27 last 37353 non-null int8 28 first_hotel_country 37353 non-null object 29 first_city_id 37353 non-null int32 30 first_duration 37353 non-null int8 31 first_same_country 37353 non-null int8 32 city_id_count 37353 non-null int16 33 affiliate_id_count 37353 non-null int32 34 hotel_country_count 37353 non-null int16 35 checkin_month_count 37353 non-null int32 36 checkin_week_count 37353 non-null int16 37 city_id_nunique 37353 non-null int8 38 hotel_country_nunique 37353 non-null int8 39 affiliate_id_nunique 37353 non-null int8 40 booker_country_nunique 37353 non-null int8 41 city_id_rank_by_hotel_country 37353 non-null float32 42 city_id_rank_by_booker_country 37353 non-null float32 43 city_id_rank_by_affiliate 37353 non-null float32 44 affiliate_id_rank_by_hotel_country 37353 non-null 
float32 45 affiliate_id_rank_by_booker_country 37353 non-null float32 46 affiliate_id_rank_by_affiliate 37353 non-null float32 47 hotel_country_rank_by_hotel_country 37353 non-null float32 48 hotel_country_rank_by_booker_country 37353 non-null float32 49 hotel_country_rank_by_affiliate 37353 non-null float32 50 checkin_month_rank_by_hotel_country 37353 non-null float32 51 checkin_month_rank_by_booker_country 37353 non-null float32 52 checkin_month_rank_by_affiliate 37353 non-null float32 53 lag1_city_id 37353 non-null int32 54 lag1_device_class 37353 non-null object 55 lag1_affiliate_id 37353 non-null int16 56 lag1_booker_country 37353 non-null object 57 lag1_hotel_country 37353 non-null object 58 lag1_duration 37333 non-null float32 59 lag1_same_country 37333 non-null float32 60 lag1_checkin_weekday 37333 non-null float32 61 lag1_checkin_week 37333 non-null float32 62 lag1_checkout_weekday 37333 non-null float32 63 lag1_city_id_count 37333 non-null float32 64 lag1_affiliate_id_count 37333 non-null float32 65 lag1_hotel_country_count 37333 non-null float32 66 lag1_checkin_week_count 37333 non-null float32 67 lag1_city_id_rank_by_hotel_country 37333 non-null float32 68 lag1_city_id_rank_by_booker_country 37333 non-null float32 69 lag1_city_id_rank_by_affiliate 37333 non-null float32 70 lag1_affiliate_id_rank_by_hotel_country 37333 non-null float32 71 lag1_hotel_country_rank_by_booker_country 37333 non-null float32 72 lag1_hotel_country_rank_by_affiliate 37333 non-null float32 73 lag2_city_id 37353 non-null int32 74 lag2_device_class 37353 non-null object 75 lag2_affiliate_id 37353 non-null int16 76 lag2_booker_country 37353 non-null object 77 lag2_hotel_country 37353 non-null object 78 lag2_duration 37305 non-null float32 79 lag2_same_country 37305 non-null float32 80 lag2_checkin_weekday 37305 non-null float32 81 lag2_checkin_week 37305 non-null float32 82 lag2_checkout_weekday 37305 non-null float32 83 lag2_city_id_count 37305 non-null float32 84 
lag2_affiliate_id_count 37305 non-null float32 85 lag2_hotel_country_count 37305 non-null float32 86 lag2_checkin_week_count 37305 non-null float32 87 lag2_city_id_rank_by_hotel_country 37305 non-null float32 88 lag2_city_id_rank_by_booker_country 37305 non-null float32 89 lag2_city_id_rank_by_affiliate 37305 non-null float32 90 lag2_affiliate_id_rank_by_hotel_country 37305 non-null float32 91 lag2_hotel_country_rank_by_booker_country 37305 non-null float32 92 lag2_hotel_country_rank_by_affiliate 37305 non-null float32 dtypes: float32(48), int16(7), int32(7), int8(20), object(11) memory usage: 12.5+ MB None
| user_id | checkin | checkout | city_id | device_class | affiliate_id | booker_country | hotel_country | utrip_id | duration | same_country | checkin_day | checkin_weekday | checkin_week | checkin_month | checkin_year | checkin_day_of_year | checkin_quarter | checkout_weekday | checkout_week | checkout_day | checkin_month_sin | checkin_month_cos | checkin_week_sin | checkin_week_cos | row_num | total_rows | last | first_hotel_country | first_city_id | first_duration | first_same_country | city_id_count | affiliate_id_count | hotel_country_count | checkin_month_count | checkin_week_count | city_id_nunique | hotel_country_nunique | affiliate_id_nunique | booker_country_nunique | city_id_rank_by_hotel_country | city_id_rank_by_booker_country | city_id_rank_by_affiliate | affiliate_id_rank_by_hotel_country | ... | hotel_country_rank_by_booker_country | hotel_country_rank_by_affiliate | checkin_month_rank_by_hotel_country | checkin_month_rank_by_booker_country | checkin_month_rank_by_affiliate | lag1_city_id | lag1_device_class | lag1_affiliate_id | lag1_booker_country | lag1_hotel_country | lag1_duration | lag1_same_country | lag1_checkin_weekday | lag1_checkin_week | lag1_checkout_weekday | lag1_city_id_count | lag1_affiliate_id_count | lag1_hotel_country_count | lag1_checkin_week_count | lag1_city_id_rank_by_hotel_country | lag1_city_id_rank_by_booker_country | lag1_city_id_rank_by_affiliate | lag1_affiliate_id_rank_by_hotel_country | lag1_hotel_country_rank_by_booker_country | lag1_hotel_country_rank_by_affiliate | lag2_city_id | lag2_device_class | lag2_affiliate_id | lag2_booker_country | lag2_hotel_country | lag2_duration | lag2_same_country | lag2_checkin_weekday | lag2_checkin_week | lag2_checkout_weekday | lag2_city_id_count | lag2_affiliate_id_count | lag2_hotel_country_count | lag2_checkin_week_count | lag2_city_id_rank_by_hotel_country | lag2_city_id_rank_by_booker_country | lag2_city_id_rank_by_affiliate | lag2_affiliate_id_rank_by_hotel_country | 
lag2_hotel_country_rank_by_booker_country | lag2_hotel_country_rank_by_affiliate | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 117283 | 136 | 1.475021e+18 | 1.475107e+18 | 41113 | desktop | 9924 | The Devilfire Empire | Osterlich | 136_4 | 1 | 0 | 28 | 2 | 39 | 9 | 1 | 272 | 3 | 3 | 39 | 29 | -0.866025 | -5.000000e-01 | -0.978557 | -0.205979 | 7 | 7 | 1 | Osterlich | 52933 | 2 | 0 | 58 | 47574 | 3203 | 24055 | 6396 | 7 | 1 | 1 | 1 | 1651.5 | 11791.5 | 20218.5 | 2733.0 | ... | 10345.5 | 11777.0 | 1930.0 | 34655.0 | 29146.0 | 45399 | desktop | 9924 | The Devilfire Empire | Osterlich | 1.0 | 0.0 | 1.0 | 39.0 | 2.0 | 23.0 | 47574.0 | 3203.0 | 6396.0 | 924.5 | 7270.5 | 13989.5 | 2733.0 | 10345.5 | 11777.0 | 46411 | desktop | 9924 | The Devilfire Empire | Osterlich | 1.0 | 0.0 | 0.0 | 39.0 | 1.0 | 154.0 | 47574.0 | 3203.0 | 6396.0 | 2291.5 | 20737.5 | 28377.5 | 2733.0 | 10345.5 | 11777.0 |
| 117413 | 149 | 1.486426e+18 | 1.486598e+18 | 28733 | desktop | 5583 | The Devilfire Empire | Kangan | 149_1 | 2 | 0 | 7 | 1 | 6 | 2 | 2 | 38 | 1 | 3 | 6 | 9 | 0.500000 | 8.660254e-01 | 0.558647 | 0.829406 | 4 | 4 | 1 | Kangan | 10485 | 2 | 0 | 92 | 52 | 7668 | 10191 | 3045 | 2 | 1 | 2 | 1 | 1883.5 | 15196.5 | 13.0 | 320.5 | ... | 25491.5 | 23.0 | 3530.5 | 13204.0 | 7.0 | 10485 | desktop | 3417 | The Devilfire Empire | Kangan | 1.0 | 0.0 | 0.0 | 6.0 | 1.0 | 894.0 | 578.0 | 7668.0 | 3045.0 | 5242.5 | 39610.5 | 473.5 | 1080.0 | 25491.5 | 287.0 | 10485 | desktop | 3417 | The Devilfire Empire | Kangan | 1.0 | 0.0 | 6.0 | 5.0 | 0.0 | 894.0 | 578.0 | 7668.0 | 3140.0 | 5242.5 | 39610.5 | 473.5 | 1080.0 | 25491.5 | 287.0 |
| 163180 | 670 | 1.471046e+18 | 1.471478e+18 | 382 | desktop | 9627 | Gondal | Fook Island | 670_2 | 5 | 0 | 13 | 5 | 32 | 8 | 1 | 226 | 3 | 3 | 33 | 18 | -0.500000 | -8.660254e-01 | -0.508531 | -0.861044 | 4 | 4 | 1 | Fook Island | 33408 | 5 | 0 | 351 | 2367 | 20012 | 38732 | 10634 | 4 | 1 | 3 | 1 | 14379.0 | 74807.5 | 1952.0 | 5271.0 | ... | 77266.0 | 1956.0 | 17850.5 | 81141.5 | 2085.0 | 65965 | desktop | 10332 | Gondal | Fook Island | 3.0 | 0.0 | 2.0 | 32.0 | 5.0 | 33.0 | 4623.0 | 20012.0 | 10634.0 | 6927.0 | 36148.0 | 1852.5 | 8178.5 | 77266.0 | 3887.5 | 31723 | tablet | 384 | Gondal | Fook Island | 3.0 | 0.0 | 6.0 | 31.0 | 2.0 | 3.0 | 15200.0 | 20012.0 | 10199.0 | 1955.5 | 9577.0 | 1742.0 | 12241.5 | 77266.0 | 12352.5 |
| 29265 | 734 | 1.454026e+18 | 1.454198e+18 | 53962 | mobile | 2164 | Tcherkistan | Aldovia | 734_1 | 2 | 0 | 29 | 4 | 4 | 1 | 1 | 29 | 1 | 6 | 4 | 31 | 0.000000 | 1.000000e+00 | 0.348202 | 0.937420 | 4 | 4 | 1 | Aldovia | 66584 | 1 | 0 | 1 | 8 | 2107 | 8564 | 2807 | 4 | 1 | 2 | 1 | 29.0 | 121.5 | 1.0 | 24.5 | ... | 4647.0 | 5.0 | 22.5 | 228.0 | 1.5 | 38862 | mobile | 2164 | Tcherkistan | Aldovia | 1.0 | 0.0 | 3.0 | 4.0 | 4.0 | 3.0 | 8.0 | 2107.0 | 2807.0 | 99.5 | 472.0 | 2.0 | 24.5 | 4647.0 | 5.0 | 20923 | mobile | 9598 | Tcherkistan | Aldovia | 1.0 | 0.0 | 2.0 | 4.0 | 3.0 | 12.0 | 4269.0 | 2107.0 | 2807.0 | 258.5 | 1477.5 | 1077.0 | 849.5 | 4647.0 | 831.5 |
| 198786 | 854 | 1.480205e+18 | 1.480291e+18 | 5860 | mobile | 9452 | Elbonia | Kangan | 854_3 | 1 | 0 | 27 | 6 | 47 | 11 | 1 | 332 | 4 | 0 | 48 | 28 | -0.866025 | 5.000000e-01 | -0.737833 | 0.674983 | 4 | 4 | 1 | Sokovia | 6582 | 1 | 0 | 115 | 14618 | 7668 | 9588 | 1939 | 2 | 2 | 2 | 1 | 2191.0 | 26716.0 | 8203.0 | 3516.5 | ... | 20637.5 | 8427.5 | 2673.0 | 6434.0 | 2610.0 | 6582 | mobile | 8132 | Elbonia | Sokovia | 1.0 | 0.0 | 5.0 | 47.0 | 6.0 | 443.0 | 3777.0 | 1889.0 | 1939.0 | 1668.0 | 34368.5 | 3348.5 | 510.0 | 8029.5 | 650.0 | 6582 | mobile | 9452 | Elbonia | Sokovia | 1.0 | 0.0 | 4.0 | 47.0 | 5.0 | 443.0 | 14618.0 | 1889.0 | 1939.0 | 1668.0 | 34368.5 | 11753.0 | 869.5 | 8029.5 | 3391.5 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 151966 | 6257581 | 1.471392e+18 | 1.471478e+18 | 22175 | mobile | 4568 | Gondal | Cobra Island | 6257581_1 | 1 | 0 | 17 | 2 | 33 | 8 | 1 | 230 | 3 | 3 | 33 | 18 | -0.500000 | -8.660254e-01 | -0.606800 | -0.794854 | 6 | 6 | 1 | Cobra Island | 45378 | 1 | 0 | 200 | 1305 | 23733 | 38732 | 8967 | 5 | 1 | 5 | 1 | 14364.5 | 66437.5 | 974.0 | 4564.5 | ... | 87652.5 | 1248.5 | 21445.5 | 81141.5 | 1138.0 | 23708 | mobile | 4568 | Gondal | Cobra Island | 2.0 | 0.0 | 0.0 | 33.0 | 2.0 | 103.0 | 1305.0 | 23733.0 | 8967.0 | 11656.0 | 54693.5 | 822.5 | 4564.5 | 87652.5 | 1248.5 | 46409 | mobile | 2322 | Gondal | Cobra Island | 1.0 | 0.0 | 6.0 | 32.0 | 0.0 | 66.0 | 681.0 | 23733.0 | 10634.0 | 9432.5 | 46949.0 | 445.5 | 3630.5 | 87652.5 | 641.5 |
| 58384 | 6257694 | 1.463011e+18 | 1.463530e+18 | 36063 | desktop | 7974 | The Devilfire Empire | Gondal | 6257694_1 | 6 | 0 | 12 | 3 | 19 | 5 | 1 | 133 | 2 | 2 | 20 | 18 | 0.866025 | -5.000000e-01 | 0.845596 | -0.533823 | 8 | 8 | 1 | Fook Island | 64876 | 2 | 0 | 1668 | 3463 | 18343 | 15729 | 3567 | 6 | 2 | 2 | 1 | 17509.5 | 47125.0 | 3325.5 | 5693.5 | ... | 36728.0 | 2675.5 | 4588.5 | 19778.5 | 1488.0 | 9608 | desktop | 7974 | The Devilfire Empire | Fook Island | 2.0 | 0.0 | 1.0 | 19.0 | 3.0 | 872.0 | 3463.0 | 20012.0 | 3567.0 | 15723.5 | 38857.5 | 2620.0 | 6364.5 | 39090.5 | 2867.5 | 17127 | desktop | 7974 | The Devilfire Empire | Fook Island | 1.0 | 0.0 | 0.0 | 19.0 | 1.0 | 1190.0 | 3463.0 | 20012.0 | 3567.0 | 16754.5 | 43757.0 | 3044.5 | 6364.5 | 39090.5 | 2867.5 |
| 98856 | 6257762 | 1.476058e+18 | 1.476230e+18 | 24912 | desktop | 9924 | The Devilfire Empire | Cobra Island | 6257762_1 | 2 | 0 | 10 | 0 | 41 | 10 | 1 | 284 | 4 | 2 | 41 | 12 | -1.000000 | -1.836970e-16 | -0.999561 | 0.029633 | 10 | 10 | 1 | Cobra Island | 55 | 1 | 0 | 1 | 47574 | 23733 | 18412 | 3418 | 9 | 1 | 3 | 1 | 425.0 | 443.0 | 1174.5 | 20582.0 | ... | 44730.5 | 44423.0 | 11022.0 | 28820.5 | 24024.5 | 29319 | desktop | 9924 | The Devilfire Empire | Cobra Island | 2.0 | 0.0 | 5.0 | 40.0 | 0.0 | 1427.0 | 47574.0 | 23733.0 | 5684.0 | 21199.0 | 45504.0 | 45642.5 | 20582.0 | 44730.5 | 44423.0 | 58413 | tablet | 3631 | The Devilfire Empire | Cobra Island | 1.0 | 0.0 | 4.0 | 40.0 | 5.0 | 209.0 | 4285.0 | 23733.0 | 5684.0 | 14773.0 | 24685.5 | 2899.0 | 7776.5 | 44730.5 | 3983.5 |
| 64260 | 6257973 | 1.470960e+18 | 1.471219e+18 | 15470 | tablet | 5755 | Gondal | Rolisica | 6257973_1 | 3 | 0 | 12 | 4 | 32 | 8 | 1 | 225 | 3 | 0 | 33 | 15 | -0.500000 | -8.660254e-01 | -0.508531 | -0.861044 | 4 | 4 | 1 | Rolisica | 53238 | 1 | 0 | 59 | 3085 | 3845 | 38732 | 10634 | 4 | 1 | 3 | 1 | 516.0 | 45165.0 | 1364.5 | 1063.0 | ... | 24934.0 | 771.0 | 3536.0 | 81141.5 | 2716.0 | 6382 | tablet | 5755 | Gondal | Rolisica | 1.0 | 0.0 | 3.0 | 32.0 | 4.0 | 5.0 | 3085.0 | 3845.0 | 10634.0 | 80.0 | 14044.0 | 353.5 | 1063.0 | 24934.0 | 771.0 | 13161 | mobile | 2436 | Gondal | Rolisica | 2.0 | 0.0 | 1.0 | 32.0 | 3.0 | 30.0 | 3066.0 | 3845.0 | 10634.0 | 338.5 | 34654.0 | 1222.5 | 977.0 | 24934.0 | 903.0 |
| 180791 | 6258041 | 1.462061e+18 | 1.462147e+18 | 17338 | mobile | 9452 | Elbonia | Glubbdubdrib | 6258041_1 | 1 | 0 | 1 | 6 | 17 | 5 | 1 | 122 | 2 | 0 | 18 | 2 | 0.866025 | -5.000000e-01 | 0.947326 | -0.320270 | 4 | 4 | 1 | Glubbdubdrib | 57109 | 1 | 0 | 18 | 14618 | 12732 | 15729 | 3131 | 3 | 1 | 1 | 1 | 3782.5 | 14697.5 | 3972.0 | 7116.0 | ... | 26918.5 | 10439.0 | 4229.5 | 12531.0 | 5303.0 | 7529 | mobile | 9452 | Elbonia | Glubbdubdrib | 1.0 | 0.0 | 5.0 | 17.0 | 6.0 | 3.0 | 14618.0 | 12732.0 | 3131.0 | 1343.0 | 5710.0 | 1366.0 | 7116.0 | 26918.5 | 10439.0 | 57109 | mobile | 9452 | Elbonia | Glubbdubdrib | 1.0 | 0.0 | 4.0 | 17.0 | 5.0 | 13.0 | 14618.0 | 12732.0 | 3131.0 | 3395.5 | 12946.5 | 3411.0 | 7116.0 | 26918.5 | 10439.0 |
37353 rows × 93 columns
# Join the candidate list (one row per candidate city per trip) onto the
# per-trip feature frame; inner join keeps trips present in both frames.
# df_feat =df_list.drop(["city_id"],axis=1,errors="ignore").merge(df_feat,how="inner",on="utrip_id")
df_feat = pd.merge(df_list, df_feat, how="inner", on="utrip_id")
del df_list  # release the candidate frame's memory
print(df_feat.shape)
df_feat
(1433173, 95)
| utrip_id | rank | label | user_id | checkin | checkout | city_id | device_class | affiliate_id | booker_country | hotel_country | duration | same_country | checkin_day | checkin_weekday | checkin_week | checkin_month | checkin_year | checkin_day_of_year | checkin_quarter | checkout_weekday | checkout_week | checkout_day | checkin_month_sin | checkin_month_cos | checkin_week_sin | checkin_week_cos | row_num | total_rows | last | first_hotel_country | first_city_id | first_duration | first_same_country | city_id_count | affiliate_id_count | hotel_country_count | checkin_month_count | checkin_week_count | city_id_nunique | hotel_country_nunique | affiliate_id_nunique | booker_country_nunique | city_id_rank_by_hotel_country | city_id_rank_by_booker_country | ... | hotel_country_rank_by_booker_country | hotel_country_rank_by_affiliate | checkin_month_rank_by_hotel_country | checkin_month_rank_by_booker_country | checkin_month_rank_by_affiliate | lag1_city_id | lag1_device_class | lag1_affiliate_id | lag1_booker_country | lag1_hotel_country | lag1_duration | lag1_same_country | lag1_checkin_weekday | lag1_checkin_week | lag1_checkout_weekday | lag1_city_id_count | lag1_affiliate_id_count | lag1_hotel_country_count | lag1_checkin_week_count | lag1_city_id_rank_by_hotel_country | lag1_city_id_rank_by_booker_country | lag1_city_id_rank_by_affiliate | lag1_affiliate_id_rank_by_hotel_country | lag1_hotel_country_rank_by_booker_country | lag1_hotel_country_rank_by_affiliate | lag2_city_id | lag2_device_class | lag2_affiliate_id | lag2_booker_country | lag2_hotel_country | lag2_duration | lag2_same_country | lag2_checkin_weekday | lag2_checkin_week | lag2_checkout_weekday | lag2_city_id_count | lag2_affiliate_id_count | lag2_hotel_country_count | lag2_checkin_week_count | lag2_city_id_rank_by_hotel_country | lag2_city_id_rank_by_booker_country | lag2_city_id_rank_by_affiliate | lag2_affiliate_id_rank_by_hotel_country | lag2_hotel_country_rank_by_booker_country | 
lag2_hotel_country_rank_by_affiliate | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3554942_1 | 0 | 0 | 3554942 | 1.451866e+18 | 1.451952e+18 | 20967 | mobile | 359 | The Devilfire Empire | Bozatta | 1 | 0 | 4 | 0 | 1 | 1 | 1 | 4 | 1 | 1 | 1 | 5 | 0.0 | 1.000000 | 0.000000 | 1.000000 | 4 | 4 | 1 | Bozatta | 9161 | 1 | 0 | 144 | 29601 | 7285 | 8564 | 1607 | 4 | 1 | 1 | 1 | 5030.5 | 19633.5 | ... | 23082.5 | 14726.0 | 93.0 | 2028.5 | 868.5 | 303 | mobile | 359 | The Devilfire Empire | Bozatta | 1.0 | 0.0 | 6.0 | 53.0 | 0.0 | 4.0 | 29601.0 | 7285.0 | 50.0 | 1154.5 | 2376.5 | 2004.0 | 4846.0 | 23082.5 | 14726.0 | 42359 | mobile | 359 | The Devilfire Empire | Bozatta | 1.0 | 0.0 | 5.0 | 53.0 | 6.0 | 44.0 | 29601.0 | 7285.0 | 50.0 | 3540.5 | 10142.5 | 7751.5 | 4846.0 | 23082.5 | 14726.0 |
| 1 | 3554942_1 | 1 | 0 | 3554942 | 1.451866e+18 | 1.451952e+18 | 20967 | mobile | 359 | The Devilfire Empire | Bozatta | 1 | 0 | 4 | 0 | 1 | 1 | 1 | 4 | 1 | 1 | 1 | 5 | 0.0 | 1.000000 | 0.000000 | 1.000000 | 4 | 4 | 1 | Bozatta | 9161 | 1 | 0 | 144 | 29601 | 7285 | 8564 | 1607 | 4 | 1 | 1 | 1 | 5030.5 | 19633.5 | ... | 23082.5 | 14726.0 | 93.0 | 2028.5 | 868.5 | 303 | mobile | 359 | The Devilfire Empire | Bozatta | 1.0 | 0.0 | 6.0 | 53.0 | 0.0 | 4.0 | 29601.0 | 7285.0 | 50.0 | 1154.5 | 2376.5 | 2004.0 | 4846.0 | 23082.5 | 14726.0 | 42359 | mobile | 359 | The Devilfire Empire | Bozatta | 1.0 | 0.0 | 5.0 | 53.0 | 6.0 | 44.0 | 29601.0 | 7285.0 | 50.0 | 3540.5 | 10142.5 | 7751.5 | 4846.0 | 23082.5 | 14726.0 |
| 2 | 3554942_1 | 2 | 0 | 3554942 | 1.451866e+18 | 1.451952e+18 | 20967 | mobile | 359 | The Devilfire Empire | Bozatta | 1 | 0 | 4 | 0 | 1 | 1 | 1 | 4 | 1 | 1 | 1 | 5 | 0.0 | 1.000000 | 0.000000 | 1.000000 | 4 | 4 | 1 | Bozatta | 9161 | 1 | 0 | 144 | 29601 | 7285 | 8564 | 1607 | 4 | 1 | 1 | 1 | 5030.5 | 19633.5 | ... | 23082.5 | 14726.0 | 93.0 | 2028.5 | 868.5 | 303 | mobile | 359 | The Devilfire Empire | Bozatta | 1.0 | 0.0 | 6.0 | 53.0 | 0.0 | 4.0 | 29601.0 | 7285.0 | 50.0 | 1154.5 | 2376.5 | 2004.0 | 4846.0 | 23082.5 | 14726.0 | 42359 | mobile | 359 | The Devilfire Empire | Bozatta | 1.0 | 0.0 | 5.0 | 53.0 | 6.0 | 44.0 | 29601.0 | 7285.0 | 50.0 | 3540.5 | 10142.5 | 7751.5 | 4846.0 | 23082.5 | 14726.0 |
| 3 | 3554942_1 | 3 | 0 | 3554942 | 1.451866e+18 | 1.451952e+18 | 20967 | mobile | 359 | The Devilfire Empire | Bozatta | 1 | 0 | 4 | 0 | 1 | 1 | 1 | 4 | 1 | 1 | 1 | 5 | 0.0 | 1.000000 | 0.000000 | 1.000000 | 4 | 4 | 1 | Bozatta | 9161 | 1 | 0 | 144 | 29601 | 7285 | 8564 | 1607 | 4 | 1 | 1 | 1 | 5030.5 | 19633.5 | ... | 23082.5 | 14726.0 | 93.0 | 2028.5 | 868.5 | 303 | mobile | 359 | The Devilfire Empire | Bozatta | 1.0 | 0.0 | 6.0 | 53.0 | 0.0 | 4.0 | 29601.0 | 7285.0 | 50.0 | 1154.5 | 2376.5 | 2004.0 | 4846.0 | 23082.5 | 14726.0 | 42359 | mobile | 359 | The Devilfire Empire | Bozatta | 1.0 | 0.0 | 5.0 | 53.0 | 6.0 | 44.0 | 29601.0 | 7285.0 | 50.0 | 3540.5 | 10142.5 | 7751.5 | 4846.0 | 23082.5 | 14726.0 |
| 4 | 3554942_1 | 4 | 0 | 3554942 | 1.451866e+18 | 1.451952e+18 | 20967 | mobile | 359 | The Devilfire Empire | Bozatta | 1 | 0 | 4 | 0 | 1 | 1 | 1 | 4 | 1 | 1 | 1 | 5 | 0.0 | 1.000000 | 0.000000 | 1.000000 | 4 | 4 | 1 | Bozatta | 9161 | 1 | 0 | 144 | 29601 | 7285 | 8564 | 1607 | 4 | 1 | 1 | 1 | 5030.5 | 19633.5 | ... | 23082.5 | 14726.0 | 93.0 | 2028.5 | 868.5 | 303 | mobile | 359 | The Devilfire Empire | Bozatta | 1.0 | 0.0 | 6.0 | 53.0 | 0.0 | 4.0 | 29601.0 | 7285.0 | 50.0 | 1154.5 | 2376.5 | 2004.0 | 4846.0 | 23082.5 | 14726.0 | 42359 | mobile | 359 | The Devilfire Empire | Bozatta | 1.0 | 0.0 | 5.0 | 53.0 | 6.0 | 44.0 | 29601.0 | 7285.0 | 50.0 | 3540.5 | 10142.5 | 7751.5 | 4846.0 | 23082.5 | 14726.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1433168 | 2276261_2 | 36 | 0 | 2276261 | 1.487808e+18 | 1.487894e+18 | 39509 | desktop | 384 | Gondal | Rolisica | 1 | 0 | 23 | 3 | 8 | 2 | 2 | 54 | 1 | 4 | 8 | 24 | 0.5 | 0.866025 | 0.737833 | 0.674983 | 4 | 4 | 1 | Rolisica | 17438 | 1 | 0 | 24 | 15200 | 3845 | 10191 | 1939 | 4 | 1 | 1 | 1 | 240.5 | 31580.0 | ... | 24934.0 | 3911.0 | 1454.5 | 15892.5 | 2380.0 | 6659 | desktop | 384 | Gondal | Rolisica | 1.0 | 0.0 | 2.0 | 8.0 | 3.0 | 131.0 | 15200.0 | 3845.0 | 1939.0 | 1176.0 | 58646.0 | 9952.0 | 2198.0 | 24934.0 | 3911.0 | 67176 | desktop | 384 | Gondal | Rolisica | 1.0 | 0.0 | 1.0 | 8.0 | 2.0 | 176.0 | 15200.0 | 3845.0 | 1939.0 | 1329.5 | 63696.0 | 10780.5 | 2198.0 | 24934.0 | 3911.0 |
| 1433169 | 2276261_2 | 37 | 0 | 2276261 | 1.487808e+18 | 1.487894e+18 | 39509 | desktop | 384 | Gondal | Rolisica | 1 | 0 | 23 | 3 | 8 | 2 | 2 | 54 | 1 | 4 | 8 | 24 | 0.5 | 0.866025 | 0.737833 | 0.674983 | 4 | 4 | 1 | Rolisica | 17438 | 1 | 0 | 24 | 15200 | 3845 | 10191 | 1939 | 4 | 1 | 1 | 1 | 240.5 | 31580.0 | ... | 24934.0 | 3911.0 | 1454.5 | 15892.5 | 2380.0 | 6659 | desktop | 384 | Gondal | Rolisica | 1.0 | 0.0 | 2.0 | 8.0 | 3.0 | 131.0 | 15200.0 | 3845.0 | 1939.0 | 1176.0 | 58646.0 | 9952.0 | 2198.0 | 24934.0 | 3911.0 | 67176 | desktop | 384 | Gondal | Rolisica | 1.0 | 0.0 | 1.0 | 8.0 | 2.0 | 176.0 | 15200.0 | 3845.0 | 1939.0 | 1329.5 | 63696.0 | 10780.5 | 2198.0 | 24934.0 | 3911.0 |
| 1433170 | 2276261_2 | 38 | 0 | 2276261 | 1.487808e+18 | 1.487894e+18 | 39509 | desktop | 384 | Gondal | Rolisica | 1 | 0 | 23 | 3 | 8 | 2 | 2 | 54 | 1 | 4 | 8 | 24 | 0.5 | 0.866025 | 0.737833 | 0.674983 | 4 | 4 | 1 | Rolisica | 17438 | 1 | 0 | 24 | 15200 | 3845 | 10191 | 1939 | 4 | 1 | 1 | 1 | 240.5 | 31580.0 | ... | 24934.0 | 3911.0 | 1454.5 | 15892.5 | 2380.0 | 6659 | desktop | 384 | Gondal | Rolisica | 1.0 | 0.0 | 2.0 | 8.0 | 3.0 | 131.0 | 15200.0 | 3845.0 | 1939.0 | 1176.0 | 58646.0 | 9952.0 | 2198.0 | 24934.0 | 3911.0 | 67176 | desktop | 384 | Gondal | Rolisica | 1.0 | 0.0 | 1.0 | 8.0 | 2.0 | 176.0 | 15200.0 | 3845.0 | 1939.0 | 1329.5 | 63696.0 | 10780.5 | 2198.0 | 24934.0 | 3911.0 |
| 1433171 | 2276261_2 | 39 | 0 | 2276261 | 1.487808e+18 | 1.487894e+18 | 39509 | desktop | 384 | Gondal | Rolisica | 1 | 0 | 23 | 3 | 8 | 2 | 2 | 54 | 1 | 4 | 8 | 24 | 0.5 | 0.866025 | 0.737833 | 0.674983 | 4 | 4 | 1 | Rolisica | 17438 | 1 | 0 | 24 | 15200 | 3845 | 10191 | 1939 | 4 | 1 | 1 | 1 | 240.5 | 31580.0 | ... | 24934.0 | 3911.0 | 1454.5 | 15892.5 | 2380.0 | 6659 | desktop | 384 | Gondal | Rolisica | 1.0 | 0.0 | 2.0 | 8.0 | 3.0 | 131.0 | 15200.0 | 3845.0 | 1939.0 | 1176.0 | 58646.0 | 9952.0 | 2198.0 | 24934.0 | 3911.0 | 67176 | desktop | 384 | Gondal | Rolisica | 1.0 | 0.0 | 1.0 | 8.0 | 2.0 | 176.0 | 15200.0 | 3845.0 | 1939.0 | 1329.5 | 63696.0 | 10780.5 | 2198.0 | 24934.0 | 3911.0 |
| 1433172 | 2276261_2 | 40 | 0 | 2276261 | 1.487808e+18 | 1.487894e+18 | 39509 | desktop | 384 | Gondal | Rolisica | 1 | 0 | 23 | 3 | 8 | 2 | 2 | 54 | 1 | 4 | 8 | 24 | 0.5 | 0.866025 | 0.737833 | 0.674983 | 4 | 4 | 1 | Rolisica | 17438 | 1 | 0 | 24 | 15200 | 3845 | 10191 | 1939 | 4 | 1 | 1 | 1 | 240.5 | 31580.0 | ... | 24934.0 | 3911.0 | 1454.5 | 15892.5 | 2380.0 | 6659 | desktop | 384 | Gondal | Rolisica | 1.0 | 0.0 | 2.0 | 8.0 | 3.0 | 131.0 | 15200.0 | 3845.0 | 1939.0 | 1176.0 | 58646.0 | 9952.0 | 2198.0 | 24934.0 | 3911.0 | 67176 | desktop | 384 | Gondal | Rolisica | 1.0 | 0.0 | 1.0 | 8.0 | 2.0 | 176.0 | 15200.0 | 3845.0 | 1939.0 | 1329.5 | 63696.0 | 10780.5 | 2198.0 | 24934.0 | 3911.0 |
1433173 rows × 95 columns
%%time
# (A second downcast pass is kept commented out — dtypes were already downcast earlier.)
# df_feat = df_feat.apply(pd.to_numeric, errors='ignore',downcast="float").apply(pd.to_numeric, errors='ignore',downcast="integer")
# Memory / dtype report for the merged training frame (~1.43M rows x 95 cols).
df_feat.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1433173 entries, 0 to 1433172 Data columns (total 95 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 utrip_id 1433173 non-null object 1 rank 1433173 non-null int16 2 label 1433173 non-null int8 3 user_id 1433173 non-null int32 4 checkin 1433173 non-null float32 5 checkout 1433173 non-null float32 6 city_id 1433173 non-null int32 7 device_class 1433173 non-null object 8 affiliate_id 1433173 non-null int16 9 booker_country 1433173 non-null object 10 hotel_country 1433173 non-null object 11 duration 1433173 non-null int8 12 same_country 1433173 non-null int8 13 checkin_day 1433173 non-null int8 14 checkin_weekday 1433173 non-null int8 15 checkin_week 1433173 non-null int8 16 checkin_month 1433173 non-null int8 17 checkin_year 1433173 non-null int8 18 checkin_day_of_year 1433173 non-null int16 19 checkin_quarter 1433173 non-null int8 20 checkout_weekday 1433173 non-null int8 21 checkout_week 1433173 non-null int8 22 checkout_day 1433173 non-null int8 23 checkin_month_sin 1433173 non-null float32 24 checkin_month_cos 1433173 non-null float32 25 checkin_week_sin 1433173 non-null float32 26 checkin_week_cos 1433173 non-null float32 27 row_num 1433173 non-null int8 28 total_rows 1433173 non-null int8 29 last 1433173 non-null int8 30 first_hotel_country 1433173 non-null object 31 first_city_id 1433173 non-null int32 32 first_duration 1433173 non-null int8 33 first_same_country 1433173 non-null int8 34 city_id_count 1433173 non-null int16 35 affiliate_id_count 1433173 non-null int32 36 hotel_country_count 1433173 non-null int16 37 checkin_month_count 1433173 non-null int32 38 checkin_week_count 1433173 non-null int16 39 city_id_nunique 1433173 non-null int8 40 hotel_country_nunique 1433173 non-null int8 41 affiliate_id_nunique 1433173 non-null int8 42 booker_country_nunique 1433173 non-null int8 43 city_id_rank_by_hotel_country 1433173 non-null float32 44 city_id_rank_by_booker_country 1433173 
non-null float32 45 city_id_rank_by_affiliate 1433173 non-null float32 46 affiliate_id_rank_by_hotel_country 1433173 non-null float32 47 affiliate_id_rank_by_booker_country 1433173 non-null float32 48 affiliate_id_rank_by_affiliate 1433173 non-null float32 49 hotel_country_rank_by_hotel_country 1433173 non-null float32 50 hotel_country_rank_by_booker_country 1433173 non-null float32 51 hotel_country_rank_by_affiliate 1433173 non-null float32 52 checkin_month_rank_by_hotel_country 1433173 non-null float32 53 checkin_month_rank_by_booker_country 1433173 non-null float32 54 checkin_month_rank_by_affiliate 1433173 non-null float32 55 lag1_city_id 1433173 non-null int32 56 lag1_device_class 1433173 non-null object 57 lag1_affiliate_id 1433173 non-null int16 58 lag1_booker_country 1433173 non-null object 59 lag1_hotel_country 1433173 non-null object 60 lag1_duration 1433173 non-null float32 61 lag1_same_country 1433173 non-null float32 62 lag1_checkin_weekday 1433173 non-null float32 63 lag1_checkin_week 1433173 non-null float32 64 lag1_checkout_weekday 1433173 non-null float32 65 lag1_city_id_count 1433173 non-null float32 66 lag1_affiliate_id_count 1433173 non-null float32 67 lag1_hotel_country_count 1433173 non-null float32 68 lag1_checkin_week_count 1433173 non-null float32 69 lag1_city_id_rank_by_hotel_country 1433173 non-null float32 70 lag1_city_id_rank_by_booker_country 1433173 non-null float32 71 lag1_city_id_rank_by_affiliate 1433173 non-null float32 72 lag1_affiliate_id_rank_by_hotel_country 1433173 non-null float32 73 lag1_hotel_country_rank_by_booker_country 1433173 non-null float32 74 lag1_hotel_country_rank_by_affiliate 1433173 non-null float32 75 lag2_city_id 1433173 non-null int32 76 lag2_device_class 1433173 non-null object 77 lag2_affiliate_id 1433173 non-null int16 78 lag2_booker_country 1433173 non-null object 79 lag2_hotel_country 1433173 non-null object 80 lag2_duration 1433132 non-null float32 81 lag2_same_country 1433132 non-null float32 82 
lag2_checkin_weekday 1433132 non-null float32 83 lag2_checkin_week 1433132 non-null float32 84 lag2_checkout_weekday 1433132 non-null float32 85 lag2_city_id_count 1433132 non-null float32 86 lag2_affiliate_id_count 1433132 non-null float32 87 lag2_hotel_country_count 1433132 non-null float32 88 lag2_checkin_week_count 1433132 non-null float32 89 lag2_city_id_rank_by_hotel_country 1433132 non-null float32 90 lag2_city_id_rank_by_booker_country 1433132 non-null float32 91 lag2_city_id_rank_by_affiliate 1433132 non-null float32 92 lag2_affiliate_id_rank_by_hotel_country 1433132 non-null float32 93 lag2_hotel_country_rank_by_booker_country 1433132 non-null float32 94 lag2_hotel_country_rank_by_affiliate 1433132 non-null float32 dtypes: float32(48), int16(8), int32(7), int8(21), object(11) memory usage: 482.5+ MB CPU times: user 1.14 s, sys: 6.36 ms, total: 1.15 s Wall time: 1.15 s
########################
## stratified train/test split by user or utrip_id
### split could maybe be by utrip ID ?
### orig - split by group :
# Group-aware split: GroupShuffleSplit keeps every row of a given user_id on
# one side only, so no traveller leaks between train and test. next() takes
# the first of the two generated splits; random_state pins it.
train_inds, test_inds = next(GroupShuffleSplit(test_size=.2, n_splits=2, random_state = 7).split(df_feat, groups=df_feat['user_id']))
# Drop leakage/id columns, then left-join per-city aggregate features.
# NOTE(review): merge() resets the row index, so X_train/X_test no longer
# share df_feat's index after this point.
X_train = df_feat.iloc[train_inds].drop(DROP_FEATS,axis=1,errors="ignore").merge(df_city_features,on="city_id",how="left")
X_test = df_feat.iloc[test_inds].drop(DROP_FEATS,axis=1,errors="ignore").merge(df_city_features,on="city_id",how="left")
# df_feat.drop(['user_id'],axis=1,errors="ignore",inplace=True)
# Apply the same drop+merge to the full frame (used later for fitting
# encoders on the union of category values).
df_feat = df_feat.drop(DROP_FEATS,axis=1,errors="ignore")
df_feat = df_feat.merge(df_city_features,on="city_id",how="left")
df_feat.tail()
# pop() removes the target column in place and returns it as a Series.
y_train = X_train.pop(TARGET_COL)
y_test = X_test.pop(TARGET_COL)
print("# classes",y_train.nunique())
# ## check that same classes in train and test -
# assert (set(y_train.unique()) == set(y_test.unique()))
# Sanity check: the binary target must have no missing values on either side.
assert y_train.isna().max() == y_test.isna().max() == 0
# classes 2
For now, try a simple multiclass model (TabNet? LSTM?); optionally subsample to only the most frequent classes/cities.
pip install pytorch-tabnet
split train/test by user id.
Try multiclass models
# TabNet imports stay disabled unless the TabNet experiment is re-enabled.
# from pytorch_tabnet.tab_model import TabNetClassifier
# from pytorch_tabnet.pretraining import TabNetPretrainer
# import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
# Fix the global NumPy seed so any downstream sampling is reproducible.
np.random.seed(0)
cat_idxs : list of int (default=[] - Mandatory for embeddings)
cat_dims : list of int (default=[] - Mandatory for embeddings)
cat_emb_dim : list of int (optional)
All categorical values must be known at training time (the demo used a label encoder). Consider doing the same here as a late step, to avoid unknown values at inference time.
# Show which engineered columns are treated as categorical features.
print(CAT_FEAT_NAMES)
['booker_country', 'device_class', 'affiliate_id', 'city_id', 'hotel_country', 'utrip_id', 'checkin_week', 'lag1_city_id', 'lag1_booker_country', 'lag1_hotel_country', 'lag1_affiliate_id', 'lag1_device_class', 'lag2_city_id', 'lag2_booker_country', 'lag2_hotel_country', 'lag2_affiliate_id', 'lag2_device_class', 'first_hotel_country', 'first_city_id']
# Numeric features = every column except the target and the categoricals.
NUMERIC_COLS = [item for item in list(df_feat.columns.drop(TARGET_COL)) if item not in CAT_FEAT_NAMES]
print(len(NUMERIC_COLS))
print("numeric cols",NUMERIC_COLS)
# Scaling is disabled: the tree/ranking models below don't need it. Kept
# for the (disabled) TabNet experiment, which would benefit from it.
# for c in NUMERIC_COLS:
# l_enc = StandardScaler() # MinMaxScaler()#
# l_enc.fit(df_feat[c].values.reshape(-1,1))
# X_train[c] = l_enc.transform(X_train[c].values.reshape(-1,1))
# X_test[c] = l_enc.transform(X_test[c].values.reshape(-1,1))
# l_enc = StandardScaler() # MinMaxScaler()#
# l_enc.fit(df_feat[NUMERIC_COLS])
# X_train[NUMERIC_COLS] = l_enc.transform(X_train[NUMERIC_COLS])
# X_test[NUMERIC_COLS] = l_enc.transform(X_test[NUMERIC_COLS])
60 numeric cols ['rank', 'duration', 'checkin_day', 'checkin_weekday', 'checkin_month', 'checkin_year', 'checkin_day_of_year', 'checkin_quarter', 'checkout_weekday', 'checkout_week', 'checkout_day', 'checkin_month_sin', 'checkin_month_cos', 'checkin_week_sin', 'checkin_week_cos', 'row_num', 'total_rows', 'last', 'first_duration', 'first_same_country', 'affiliate_id_count', 'checkin_month_count', 'checkin_week_count', 'affiliate_id_nunique', 'booker_country_nunique', 'checkin_month_rank_by_booker_country', 'checkin_month_rank_by_affiliate', 'lag1_duration', 'lag1_same_country', 'lag1_checkin_weekday', 'lag1_checkin_week', 'lag1_checkout_weekday', 'lag1_city_id_count', 'lag1_affiliate_id_count', 'lag1_hotel_country_count', 'lag1_checkin_week_count', 'lag1_city_id_rank_by_hotel_country', 'lag1_city_id_rank_by_booker_country', 'lag1_city_id_rank_by_affiliate', 'lag1_affiliate_id_rank_by_hotel_country', 'lag1_hotel_country_rank_by_booker_country', 'lag1_hotel_country_rank_by_affiliate', 'lag2_duration', 'lag2_same_country', 'lag2_checkin_weekday', 'lag2_checkin_week', 'lag2_checkout_weekday', 'lag2_city_id_count', 'lag2_affiliate_id_count', 'lag2_hotel_country_count', 'lag2_checkin_week_count', 'lag2_city_id_rank_by_hotel_country', 'lag2_city_id_rank_by_booker_country', 'lag2_city_id_rank_by_affiliate', 'lag2_affiliate_id_rank_by_hotel_country', 'lag2_hotel_country_rank_by_booker_country', 'lag2_hotel_country_rank_by_affiliate', 'city_id_count', 'hotel_country_count', 'city_id_rank_by_hotel_country']
# OBJECT_COLS = list(df_feat.select_dtypes("O").columns)
# df_feat[OBJECT_COLS] = df_feat[OBJECT_COLS].fillna("")
# Fill missing object columns with "" so they become a valid category value.
X_train[OBJECT_COLS] = X_train[OBJECT_COLS].fillna("")
X_test[OBJECT_COLS] = X_test[OBJECT_COLS].fillna("")
# df_feat[CAT_FEAT_NAMES] = df_feat[CAT_FEAT_NAMES].fillna(-1)
# Numeric id-like categoricals get -1 as the "missing" sentinel; for object
# categoricals already filled with "" above this is a no-op.
X_train[CAT_FEAT_NAMES] = X_train[CAT_FEAT_NAMES].fillna(-1)
X_test[CAT_FEAT_NAMES] = X_test[CAT_FEAT_NAMES].fillna(-1)
Ordinal/categorical encoder
#### For now we cheat and fit on df_feat, i.e. on all possible categorical values. At test time we must check that there are no unknown values, or update the sklearn version (for OrdinalEncoder's handle_unknown support).
# Label/ordinal encoding is disabled: CatBoost consumes raw categoricals.
# for c in CAT_FEAT_NAMES:
# l_enc = LabelEncoder().fit(df_feat[c])
# X_train[c] = l_enc.transform(X_train[c])
# X_test[c] = l_enc.transform(X_test[c])
#### https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html
# # cat_encoder = OrdinalEncoder(handle_unknown="use_encoded_value",unknown_value="")
# cat_encoder = OrdinalEncoder()
# # X_train[CAT_FEAT_NAMES] = cat_encoder.fit_transform(X_train[CAT_FEAT_NAMES])
# cat_encoder.fit(df_feat[CAT_FEAT_NAMES])
# X_train[CAT_FEAT_NAMES] = cat_encoder.transform(X_train[CAT_FEAT_NAMES])
# X_test[CAT_FEAT_NAMES] = cat_encoder.transform(X_test[CAT_FEAT_NAMES])
# X_train.columns.get_loc(CAT_FEAT_NAMES)
# Positional indices of the categorical columns (TabNet's cat_idxs format).
cat_idxs = [X_train.columns.get_loc(c) for c in CAT_FEAT_NAMES if c in X_train]
# The `if c in X_train` guard silently skips absent columns; this assert
# verifies none were actually missing.
assert len(cat_idxs) == len(CAT_FEAT_NAMES)
print(cat_idxs)
[5, 3, 4, 2, 75, 0, 9, 35, 38, 39, 37, 36, 55, 58, 59, 57, 56, 24, 25]
# #### get nuniques and set embeding dimension per categorical
# ### note that we need to change here if we want a higher embedding dimension!
# nunique = X_train.nunique()
# types = X_train.dtypes
# # categorical_columns = []
# categorical_dims = [] #{}
# cat_embed_dims = []
# for i,col in enumerate(cat_idxs):
# # print(i,col)
# # c_uniques = X_train.iloc[:,col].nunique()
# c_uniques = df_feat[CAT_FEAT_NAMES[i]].nunique() ## try to use original data, more nuniques?
# categorical_dims.append(c_uniques)
# # if col == "user_id" : cat_embed_dims.append(10) ## need to change to use names. user id may overfit
# cat_embed_dims.append(min(100,c_uniques//2))
# No NaNs may remain in the categoricals after the fill steps above.
assert X_test[CAT_FEAT_NAMES].isna().sum().max() == X_train[CAT_FEAT_NAMES].isna().sum().max() == 0
# Class balance check: in this binary candidate/label framing the positive
# class is rare (printed below as ~2.3%).
print("sum top4 total percentage:",y_train.value_counts(normalize=True)[0:4].sum().round(3))
y_train.value_counts(normalize=True).round(5)
sum top4 total percentage: 1.0
0 0.97705 1 0.02295 Name: label, dtype: float64
# Self-supervised TabNet pretraining (masked-feature reconstruction).
# NOTE(review): categorical_dims / cat_embed_dims are computed in a cell
# that is currently commented out above - re-enable it before setting
# RUN_TABNET=True, otherwise this raises NameError. (Indentation below was
# flattened by the notebook export.)
if RUN_TABNET:
# TabNetPretrainer
unsupervised_model = TabNetPretrainer(
n_d=32, n_a=32, n_steps=4,
cat_idxs=cat_idxs,
cat_dims=categorical_dims,
cat_emb_dim=cat_embed_dims,
optimizer_fn=torch.optim.Adam,
optimizer_params=dict(lr=2e-2),
mask_type='entmax', # "sparsemax"
device_name="auto" #"auto" "cpu"
)
# Pretrain on features only (no labels); 35% of cells are masked per row.
unsupervised_model.fit(
X_train=X_train.values,
# eval_set=[X_test.values],
pretraining_ratio=0.35,
max_epochs=6,
batch_size = 512 ,# 1024 default , ~256-512 with GPU
)
## save unsup model
### https://github.com/dreamquark-ai/tabnet/blob/develop/pretraining_example.ipynb
# unsupervised_model.save_model('./.4_pretrain')
from __future__ import print_function, absolute_import
# from pytorch_tabnet.metrics import Metric
# from sklearn.metrics import top_k_accuracy_score
# Export the function actually defined below (was 'accuracy', which does
# not exist in this module).
__all__ = ['accuracy_k']
def accuracy_k(output, target, topk=(4,)):  # (1,))
    """Compute top-k accuracy (percentage) for the specified values of k.

    Args:
        output: Tensor of shape (batch, n_classes) with class scores/logits.
        target: Tensor of shape (batch,) with integer class labels.
        topk: tuple of k values to evaluate.

    Returns:
        list of scalar tensors, one accuracy percentage per k in `topk`.
    """
    maxk = max(topk)
    batch_size = target.size(0)
    # Indices of the maxk highest-scoring classes per sample: (batch, maxk).
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()  # (maxk, batch) - row i holds the rank-i predictions
    # Broadcast-compare each prediction row against the targets.
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    res = []
    for k in topk:
        # reshape (not view): the slice of the transposed tensor is
        # non-contiguous, and view() would raise a RuntimeError on it.
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res
# Supervised TabNet classifier, warm-started from the pretrained encoder.
# NOTE(review): categorical_dims / cat_embed_dims and unsupervised_model
# come from earlier guarded/commented cells - all must exist before running
# with RUN_TABNET=True. (Indentation flattened by the notebook export.)
if RUN_TABNET:
clf = TabNetClassifier(
n_d=32, n_a=32, n_steps=4,
cat_idxs=cat_idxs,
cat_dims=categorical_dims,
cat_emb_dim=cat_embed_dims,
optimizer_fn=torch.optim.Adam,
optimizer_params=dict(lr=2e-2),
scheduler_params={"step_size":50, # how to use learning rate scheduler
"gamma":0.9},
scheduler_fn=torch.optim.lr_scheduler.StepLR,
# mask_type='entmax', # "sparsemax"
device_name="auto" #"auto" "cpu"
)
# Train with both train and test tracked as eval sets.
clf.fit(
X_train=X_train.values, y_train=y_train.values,
eval_set=[(X_train.values, y_train.values), (X_test.values, y_test.values)],
# eval_set=[(X_test.values, y_test.values)],
eval_name=['train','test'],
eval_metric=['accuracy'], # 'accuracy',
max_epochs=max_epochs,
batch_size = 512 ,# 1024 default , ~256-512 with GPU
from_unsupervised=unsupervised_model,
)
# clf.save_model('./.full_tabnet_1192class')
# Inspect TabNet feature importances (only meaningful after a TabNet run).
if RUN_TABNET:
## top features (unsorted) - booker country
# X_train.columns[clf.feature_importances_>1e-7]
# Pair each column with its importance, keep non-zero ones, sort descending.
feat_imp = pd.DataFrame([X_train.columns,clf.feature_importances_]).T
feat_imp = feat_imp.loc[feat_imp[1]>0].sort_values(1,ascending=False).reset_index(drop=True)
feat_imp
# Results of a previous run, kept for reference:
# print(feat_imp[0].values)
# ['lag2_city_id' 'lag1_booker_country' 'lag2_booker_country'
# 'lag2_hotel_country' 'lag1_city_id' 'first_hotel_country'
# 'lag1_device_class' 'device_class' 'lag2_device_class' 'booker_country'
# 'lag1_hotel_country_rank_by_booker_country' 'checkin_week'
# 'lag1_city_id_count' 'checkin_quarter'
# 'lag2_booker_country_rank_by_booker_country' 'lag1_affiliate_id_count'
# 'lag1_checkin_month_rank_by_affiliate'
# 'lag2_checkin_month_rank_by_hotel_country' 'checkin_month_sin'
# 'lag2_hotel_country_count' 'lag2_checkin_month_count'
# 'lag2_affiliate_id_rank_by_booker_country' 'lag2_city_id_nunique']
Possible extensions: add embeddings from LightFM, more features, etc.
CatBoost has ranking models, supports embedding features, and can take time into account.
# Downcast id-like categoricals to the smallest integer dtype that fits;
# non-numeric (object) columns pass through unchanged via errors='ignore'.
# NOTE(review): errors='ignore' for pd.to_numeric is deprecated in newer
# pandas (>=2.2) - fine on the pandas 1.1.5 pinned in this notebook.
X_train[CAT_FEAT_NAMES] = X_train[CAT_FEAT_NAMES].apply(pd.to_numeric, errors='ignore',downcast="integer")
X_test[CAT_FEAT_NAMES] = X_test[CAT_FEAT_NAMES].apply(pd.to_numeric, errors='ignore',downcast="integer")
X_train[CAT_FEAT_NAMES].dtypes
booker_country object device_class object affiliate_id int16 city_id int32 hotel_country object utrip_id object checkin_week int8 lag1_city_id int32 lag1_booker_country object lag1_hotel_country object lag1_affiliate_id int16 lag1_device_class object lag2_city_id int32 lag2_booker_country object lag2_hotel_country object lag2_affiliate_id int16 lag2_device_class object first_hotel_country object first_city_id int32 dtype: object
# Report split sizes (~80/20 by user groups).
print("train rows",X_train.shape[0])
print("test rows",X_test.shape[0])
train rows 1147004 test rows 286169
# Free the full feature frame before training to reduce peak RAM.
del df_feat
gc.collect()
197
def top4_accuracy(model, X=None, y=None):
    """Top-4 accuracy of a binary candidate-ranking model over trips.

    For each utrip_id, rank that trip's candidate rows by the predicted
    probability of the positive class, and count the trips whose true city
    (label == 1) appears among the top 4 candidates.

    Parameters
    ----------
    model : object exposing ``predict(X, prediction_type="Probability")``
        returning an array whose column 1 is P(label=1) (e.g. CatBoost).
    X : DataFrame with at least 'city_id' and 'utrip_id' columns.
        Defaults to the notebook-global ``X_test`` (kept for backward
        compatibility with the original global-based, hacky usage).
    y : Series of binary labels aligned with X. Defaults to global ``y_test``.

    Returns
    -------
    float : percentage of trips whose true city is in the top 4.
    """
    if X is None:
        X = X_test  # original hacky behavior: fall back to notebook globals
    if y is None:
        y = y_test
    # P(label=1) for every candidate row.
    y_test_pred = model.predict(X, prediction_type="Probability")[:, 1]
    test_preds = pd.DataFrame({"y_test_pred": y_test_pred,
                               "candidate_city_id": X["city_id"],
                               "utrip_id": X["utrip_id"],
                               "label": y
                               })
    # Descending sort puts each trip's highest-probability candidates first
    # (sorting utrip_id descending too is harmless for the groupby below).
    test_preds.sort_values(["utrip_id", "y_test_pred"], inplace=True, ascending=False)
    # head(4) keeps the 4 best-scored rows per trip; summing their labels
    # counts the trips whose true city made the cut (one positive per trip).
    res = 100 * test_preds.groupby("utrip_id").head(4)["label"].sum() / test_preds["utrip_id"].nunique()
    print("Top 4 Accuracy: {0:.4f}".format(res))
    return res
%%time
# CatBoost Pools: group_id=utrip_id tells the ranking losses which rows
# belong to the same query (trip); candidates within a trip compete.
# NOTE(review): CatBoost requires rows sharing a group_id to be contiguous -
# verify X_train/X_test are still ordered by utrip_id at this point.
train_pool = Pool(data=X_train,label = y_train,cat_features=CAT_FEAT_NAMES
,group_id=X_train["utrip_id"]
)
test_pool = Pool(data=X_test,label = y_test,cat_features=CAT_FEAT_NAMES
,group_id=X_test["utrip_id"]
)
CPU times: user 15.4 s, sys: 107 ms, total: 15.5 s Wall time: 14.8 s
# https://colab.research.google.com/github/catboost/tutorials/blob/master/ranking/ranking_tutorial.ipynb#scrollTo=PC-BWnUscjHB
# Shared CatBoost settings for every ranking-loss experiment below.
default_parameters = {
    'iterations': max_epochs,
    # Ranking AUC + recall@4 mirror the competition's top-4 target metric.
    'custom_metric': ['AUC:type=Ranking', 'RecallAt:top=4'],
    'verbose': False,
    'random_seed': 0,
    "metric_period": 50,  # evaluate/log custom metrics every 50 iterations
    "task_type": "GPU",
    # 'loss_function': "PairLogit", # "PairLogitPairwise"
    # "ignored_features":["utrip_id"],
}
# Copy so later per-experiment tweaks never mutate the shared defaults.
# (A dead `parameters = {}` that was immediately overwritten was removed.)
parameters = deepcopy(default_parameters)
### parameter grid for grid search / Hyperparameter tuning:
hyper_params = {'depth': [3, 6, 8, 10],
                # 'learning_rate':[0.03,0.001,0.01,0.1,0.2,0.3],
                'l2_leaf_reg': [3, 1, 5, 10],
                # 'border_count':[32,10,50,100,200], ## we may want to set city ID as golden features
                # 'ctr_border_count':[50,5,10,20,100,200],
                # 'loss_function':["PairLogit", "PairLogitPairwise","YetiRank"], # not supported in grid search :()
                }
# (Indentation below was flattened by the notebook export.)
def fit_cbr_model(loss_function, additional_params=None, train_pool=train_pool, test_pool=test_pool,CV=True):
"""
Fit a CatBoost ranking model with the given loss function, or cross-validate it.

Parameters
----------
loss_function : str
    CatBoost ranking loss, e.g. "PairLogit", "QuerySoftMax", "YetiRank".
additional_params : dict, optional
    Overrides merged on top of the notebook-global default_parameters.
train_pool, test_pool : catboost.Pool
    Default to the notebook-global pools (bound once at definition time).
CV : bool
    If True, run catboost.cv (and also fit one model just to print the
    hacky top-4 accuracy) and return the CV scores; otherwise fit a
    single model and return it.

https://colab.research.google.com/github/catboost/tutorials/blob/master/ranking/ranking_tutorial.ipynb#scrollTo=RVNW0nowbtxH
>>>model = fit_model('PairLogit', {'custom_metric': ['PrecisionAt:top=10', 'RecallAt:top=4']})
"""
# Copy so this call never mutates the shared defaults.
parameters = deepcopy(default_parameters)
parameters['loss_function'] = loss_function
# A distinct train_dir per loss keeps CatBoost logs/plots separable.
parameters['train_dir'] = loss_function
if additional_params is not None:
parameters.update(additional_params)
if CV:
scores = cv(train_pool,params=parameters, plot=True)
# print(scores)
## train model to get hacky top4 accuracy
model = CatBoost(parameters)
model.fit(train_pool, eval_set=test_pool, plot=False)
print()
top4_acc = top4_accuracy(model)
return scores
else:
model = CatBoost(parameters)
model.fit(train_pool, eval_set=test_pool, plot=True)
return model
%%time
# Experiment 1: pairwise logistic ranking loss. Past-run results below.
fit_cbr_model("PairLogit")
# Top 4 Accuracy: 58.1335
# test-RecallAt:top=4-mean: 0.64
# train-RecallAt:top=4-mean: 0.65
# CPU times: user 7min 30s, sys: 2min 22s, total: 9min 53s
# Wall time: 7min 29s
Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. 
Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Top 4 Accuracy: 58.1335 CPU times: user 7min 30s, sys: 2min 22s, total: 9min 53s Wall time: 7min 29s
| iterations | test-PairLogit-mean | test-PairLogit-std | train-PairLogit-mean | train-PairLogit-std | test-AUC:type=Ranking-mean | test-AUC:type=Ranking-std | test-RecallAt:top=4-mean | test-RecallAt:top=4-std | train-RecallAt:top=4-mean | train-RecallAt:top=4-std | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0.681205 | 0.004114 | 0.678151 | 0.000140 | 0.720692 | 0.002501 | 0.057062 | 0.001778 | 0.057062 | 0.000889 |
| 1 | 50 | 0.465969 | 0.002092 | 0.382545 | 0.001941 | 0.819280 | 0.002471 | 0.521653 | 0.002518 | 0.607873 | 0.002464 |
| 2 | 100 | 0.388649 | 0.011086 | 0.326751 | 0.001990 | 0.841248 | 0.007854 | 0.521653 | 0.002518 | 0.607873 | 0.002464 |
| 3 | 150 | 0.335585 | 0.010523 | 0.303870 | 0.002287 | 0.864322 | 0.004224 | 0.596948 | 0.000727 | 0.610184 | 0.002322 |
| 4 | 200 | 0.317865 | 0.009503 | 0.291993 | 0.003045 | 0.871461 | 0.004538 | 0.609485 | 0.003935 | 0.619443 | 0.004341 |
| 5 | 250 | 0.307280 | 0.006079 | 0.285274 | 0.002211 | 0.876027 | 0.003013 | 0.617330 | 0.002443 | 0.627736 | 0.002541 |
| 6 | 300 | 0.300430 | 0.004378 | 0.280186 | 0.001683 | 0.878610 | 0.001718 | 0.622989 | 0.002895 | 0.631712 | 0.001813 |
| 7 | 350 | 0.295912 | 0.004156 | 0.276058 | 0.001338 | 0.880201 | 0.001148 | 0.626679 | 0.003438 | 0.635563 | 0.001052 |
| 8 | 400 | 0.292430 | 0.003525 | 0.272472 | 0.001059 | 0.881371 | 0.000804 | 0.629544 | 0.005507 | 0.638930 | 0.000600 |
| 9 | 450 | 0.290317 | 0.003741 | 0.270159 | 0.001315 | 0.882047 | 0.001155 | 0.631980 | 0.004276 | 0.640971 | 0.001133 |
| 10 | 500 | 0.288531 | 0.004021 | 0.268092 | 0.001652 | 0.882535 | 0.001074 | 0.633485 | 0.004146 | 0.643031 | 0.001744 |
| 11 | 550 | 0.286940 | 0.004171 | 0.266140 | 0.001780 | 0.882862 | 0.001211 | 0.635777 | 0.003999 | 0.644840 | 0.000800 |
| 12 | 600 | 0.285304 | 0.003984 | 0.264036 | 0.001635 | 0.883324 | 0.001364 | 0.637461 | 0.004474 | 0.646380 | 0.001985 |
| 13 | 650 | 0.283920 | 0.004120 | 0.262415 | 0.001911 | 0.883605 | 0.001528 | 0.638858 | 0.003963 | 0.648100 | 0.001933 |
| 14 | 700 | 0.282703 | 0.004024 | 0.260989 | 0.001797 | 0.883822 | 0.001546 | 0.640721 | 0.004192 | 0.649497 | 0.002253 |
| 15 | 750 | 0.281548 | 0.003952 | 0.259555 | 0.001718 | 0.884110 | 0.001491 | 0.641795 | 0.005182 | 0.650750 | 0.002669 |
| 16 | 799 | 0.280603 | 0.004217 | 0.258362 | 0.001854 | 0.884231 | 0.001518 | 0.642118 | 0.004289 | 0.652004 | 0.003306 |
%%time
# Experiment 2: PairLogitPairwise - faster here but scored worse than
# plain PairLogit. Past-run results below.
fit_cbr_model("PairLogitPairwise")
# Top 4 Accuracy: 54.5441
# test-RecallAt:top=4-mean: 0.55
# train-RecallAt:top=4-mean: 0.65
# CPU times: user 3min 24s, sys: 2min 36s, total: 6min 1s
# Wall time: 3min 16s
Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Top 4 Accuracy: 54.5441 CPU times: user 3min 24s, sys: 2min 36s, total: 6min 1s Wall time: 3min 16s
| iterations | test-PairLogit-mean | test-PairLogit-std | train-PairLogit-mean | train-PairLogit-std | test-AUC:type=Ranking-mean | test-AUC:type=Ranking-std | test-RecallAt:top=4-mean | test-RecallAt:top=4-std | train-RecallAt:top=4-mean | train-RecallAt:top=4-std | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0.680401 | 0.000065 | 0.676929 | 0.000024 | 0.794460 | 0.004566 | 0.316085 | 0.226091 | 0.404429 | 0.225771 |
| 1 | 50 | 0.423635 | 0.001979 | 0.364262 | 0.000463 | 0.837174 | 0.004820 | 0.480782 | 0.001135 | 0.568202 | 0.002830 |
| 2 | 100 | 0.369338 | 0.004405 | 0.302977 | 0.000498 | 0.846878 | 0.003592 | 0.530643 | 0.029789 | 0.588405 | 0.001644 |
| 3 | 150 | 0.357422 | 0.003370 | 0.283101 | 0.000399 | 0.848349 | 0.002638 | 0.531360 | 0.014507 | 0.595605 | 0.003097 |
| 4 | 200 | 0.354596 | 0.003356 | 0.274244 | 0.000326 | 0.848000 | 0.002595 | 0.535516 | 0.004581 | 0.603038 | 0.002386 |
| 5 | 250 | 0.352383 | 0.005074 | 0.267531 | 0.000114 | 0.848128 | 0.002922 | 0.535731 | 0.007029 | 0.608590 | 0.001649 |
| 6 | 300 | 0.353311 | 0.006080 | 0.262842 | 0.000456 | 0.847049 | 0.002974 | 0.538095 | 0.005736 | 0.616399 | 0.003709 |
| 7 | 350 | 0.354928 | 0.006837 | 0.258672 | 0.000365 | 0.845744 | 0.002768 | 0.539420 | 0.004732 | 0.621163 | 0.003347 |
| 8 | 400 | 0.356264 | 0.006767 | 0.254926 | 0.000614 | 0.845078 | 0.002343 | 0.540996 | 0.004206 | 0.625640 | 0.003207 |
| 9 | 450 | 0.356053 | 0.005982 | 0.250997 | 0.001101 | 0.845008 | 0.001749 | 0.544113 | 0.002133 | 0.630386 | 0.003121 |
| 10 | 500 | 0.355091 | 0.005590 | 0.247249 | 0.001120 | 0.845233 | 0.001369 | 0.546620 | 0.001893 | 0.635867 | 0.003181 |
| 11 | 550 | 0.354878 | 0.006475 | 0.243865 | 0.001291 | 0.845351 | 0.001741 | 0.549199 | 0.003403 | 0.640416 | 0.003741 |
| 12 | 600 | 0.353845 | 0.006418 | 0.240640 | 0.001313 | 0.845586 | 0.001530 | 0.550990 | 0.003311 | 0.644195 | 0.003639 |
| 13 | 650 | 0.353275 | 0.006732 | 0.237766 | 0.001218 | 0.845666 | 0.001754 | 0.553748 | 0.004865 | 0.648010 | 0.002431 |
| 14 | 700 | 0.353930 | 0.007072 | 0.235227 | 0.001184 | 0.845203 | 0.002002 | 0.554644 | 0.005564 | 0.651270 | 0.001337 |
| 15 | 750 | 0.354666 | 0.008130 | 0.232769 | 0.001008 | 0.844839 | 0.002233 | 0.556148 | 0.005321 | 0.654422 | 0.001099 |
| 16 | 799 | 0.355069 | 0.008125 | 0.230471 | 0.000984 | 0.844646 | 0.002017 | 0.557008 | 0.005391 | 0.657037 | 0.001653 |
%%time
# Experiment 3: YetiRank listwise loss. Past-run results below.
fit_cbr_model("YetiRank") ## YetiRankPairwise - may be better, but slow
# Top 4 Accuracy: 53.4530
# CPU times: user 9min 26s, sys: 3min 52s, total: 13min 19s
# Wall time: 10min 42s
# test-RecallAt:top=4-mean: 0.63
# train-RecallAt:top=4-mean: .63
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric PFound is not implemented on GPU. 
Will use CPU for metric computation, this could significantly affect learning time Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Top 4 Accuracy: 53.4530 CPU times: user 9min 26s, sys: 3min 52s, total: 13min 19s Wall time: 10min 42s
| iterations | test-PFound-mean | test-PFound-std | test-AUC:type=Ranking-mean | test-AUC:type=Ranking-std | test-RecallAt:top=4-mean | test-RecallAt:top=4-std | train-RecallAt:top=4-mean | train-RecallAt:top=4-std | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0.223576 | 0.013779 | 0.720692 | 0.002501 | 0.057062 | 0.001778 | 0.057062 | 0.000889 |
| 1 | 50 | 0.350355 | 0.002912 | 0.749169 | 0.001301 | 0.521653 | 0.002518 | 0.521653 | 0.001259 |
| 2 | 100 | 0.417470 | 0.001840 | 0.799094 | 0.003563 | 0.521653 | 0.002518 | 0.607873 | 0.002464 |
| 3 | 150 | 0.458801 | 0.001900 | 0.826364 | 0.002089 | 0.521653 | 0.002518 | 0.607873 | 0.002464 |
| 4 | 200 | 0.459410 | 0.001707 | 0.834172 | 0.002473 | 0.521653 | 0.002518 | 0.607873 | 0.002464 |
| 5 | 250 | 0.509542 | 0.004881 | 0.837065 | 0.003499 | 0.521653 | 0.002518 | 0.607873 | 0.002464 |
| 6 | 300 | 0.514873 | 0.002383 | 0.839832 | 0.003915 | 0.526167 | 0.009936 | 0.611635 | 0.005099 |
| 7 | 350 | 0.520538 | 0.002408 | 0.842008 | 0.003664 | 0.536089 | 0.004459 | 0.617187 | 0.001513 |
| 8 | 400 | 0.526907 | 0.001659 | 0.844383 | 0.003254 | 0.536841 | 0.003255 | 0.619766 | 0.002287 |
| 9 | 450 | 0.533969 | 0.002122 | 0.846946 | 0.003815 | 0.538990 | 0.004811 | 0.620733 | 0.001680 |
| 10 | 500 | 0.543509 | 0.004304 | 0.850466 | 0.003970 | 0.541820 | 0.005914 | 0.621700 | 0.001100 |
| 11 | 550 | 0.552867 | 0.005322 | 0.854414 | 0.003327 | 0.553642 | 0.011482 | 0.623742 | 0.002237 |
| 12 | 600 | 0.565864 | 0.009832 | 0.857365 | 0.002874 | 0.585343 | 0.027094 | 0.627772 | 0.002916 |
| 13 | 650 | 0.578655 | 0.004536 | 0.860494 | 0.002519 | 0.629580 | 0.005090 | 0.628703 | 0.003992 |
| 14 | 700 | 0.590608 | 0.004512 | 0.862752 | 0.002551 | 0.635025 | 0.006873 | 0.633055 | 0.001620 |
| 15 | 750 | 0.601615 | 0.004151 | 0.864918 | 0.001875 | 0.637497 | 0.005984 | 0.635025 | 0.002399 |
| 16 | 799 | 0.603385 | 0.004600 | 0.866040 | 0.002037 | 0.638643 | 0.006159 | 0.635509 | 0.002984 |
%%time
# Experiment 4: QuerySoftMax - best top-4 accuracy so far, but the
# slowest to train. Past-run results below.
fit_cbr_model("QuerySoftMax") ## may be slow ?
# Top 4 Accuracy: 67.2362
# test-RecallAt:top=4-mean : 0.70
# train-RecallAt:top=4-mean : 0.68
# CPU times: user 24min 39s, sys: 6min 13s, total: 30min 53s
# Wall time: 25min 15s
Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Top 4 Accuracy: 67.2362 CPU times: user 24min 39s, sys: 6min 13s, total: 30min 53s Wall time: 25min 15s
| iterations | test-QuerySoftMax-mean | test-QuerySoftMax-std | train-QuerySoftMax-mean | train-QuerySoftMax-std | test-AUC:type=Ranking-mean | test-AUC:type=Ranking-std | test-RecallAt:top=4-mean | test-RecallAt:top=4-std | train-RecallAt:top=4-mean | train-RecallAt:top=4-std | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3.693732 | 0.000551 | 3.693735 | 0.000276 | 0.545718 | 0.001266 | 0.143282 | 0.002229 | 0.143282 | 0.001114 |
| 1 | 50 | 2.689877 | 0.024840 | 2.703698 | 0.008108 | 0.838276 | 0.012595 | 0.607909 | 0.004972 | 0.609611 | 0.002174 |
| 2 | 100 | 2.316377 | 0.008981 | 2.333944 | 0.012313 | 0.876832 | 0.002328 | 0.609055 | 0.006529 | 0.609611 | 0.002174 |
| 3 | 150 | 2.231935 | 0.011517 | 2.248327 | 0.009521 | 0.878701 | 0.002359 | 0.614679 | 0.005144 | 0.611616 | 0.004883 |
| 4 | 200 | 2.171050 | 0.022833 | 2.190092 | 0.006588 | 0.882582 | 0.002747 | 0.638285 | 0.017039 | 0.631175 | 0.007361 |
| 5 | 250 | 2.083975 | 0.022727 | 2.114509 | 0.004244 | 0.886435 | 0.002135 | 0.683204 | 0.007742 | 0.667801 | 0.001439 |
| 6 | 300 | 2.040074 | 0.009247 | 2.075267 | 0.010008 | 0.888063 | 0.001643 | 0.691944 | 0.002482 | 0.674266 | 0.004153 |
| 7 | 350 | 2.018933 | 0.003273 | 2.055512 | 0.019933 | 0.888871 | 0.001918 | 0.693735 | 0.004086 | 0.676237 | 0.004713 |
| 8 | 400 | 2.003402 | 0.015768 | 2.041017 | 0.029119 | 0.889399 | 0.001878 | 0.695634 | 0.003779 | 0.677974 | 0.005619 |
| 9 | 450 | 1.992696 | 0.022726 | 2.030667 | 0.034245 | 0.889969 | 0.001460 | 0.698069 | 0.002351 | 0.679371 | 0.005823 |
| 10 | 500 | 1.984110 | 0.025091 | 2.022182 | 0.036194 | 0.890473 | 0.001231 | 0.699896 | 0.001337 | 0.681520 | 0.005489 |
| 11 | 550 | 1.979045 | 0.026804 | 2.016906 | 0.037716 | 0.890723 | 0.001131 | 0.701222 | 0.001944 | 0.682004 | 0.005367 |
| 12 | 600 | 1.972345 | 0.025731 | 2.009913 | 0.038064 | 0.891079 | 0.001119 | 0.702762 | 0.003417 | 0.683938 | 0.006933 |
| 13 | 650 | 1.966947 | 0.026847 | 2.004649 | 0.039982 | 0.891416 | 0.001286 | 0.703765 | 0.002947 | 0.684475 | 0.006545 |
| 14 | 700 | 1.962582 | 0.025154 | 2.000284 | 0.039488 | 0.891529 | 0.001259 | 0.704624 | 0.002429 | 0.685550 | 0.006861 |
| 15 | 750 | 1.957841 | 0.026703 | 1.995652 | 0.041653 | 0.891699 | 0.001473 | 0.705807 | 0.003142 | 0.686750 | 0.007898 |
| 16 | 799 | 1.951281 | 0.024621 | 1.989186 | 0.040471 | 0.892059 | 0.001484 | 0.707204 | 0.003745 | 0.688004 | 0.007347 |
%%time
# Train the CatBoost ranker with the YetiRankPairwise objective for comparison.
# Performed noticeably worse than QuerySoftMax on this data (see recorded results).
fit_cbr_model("YetiRankPairwise")
# Recorded results of this run:
# Top 4 Accuracy: 56.0373
# test-RecallAt:top=4-mean: 0.57
# train-RecallAt:top=4-mean: 0.65
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric PFound is not implemented on GPU. 
Will use CPU for metric computation, this could significantly affect learning time Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time Metric RecallAt:top=4 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Top 4 Accuracy: 56.0373
| iterations | test-PFound-mean | test-PFound-std | test-AUC:type=Ranking-mean | test-AUC:type=Ranking-std | test-RecallAt:top=4-mean | test-RecallAt:top=4-std | train-RecallAt:top=4-mean | train-RecallAt:top=4-std | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0.322735 | 0.079769 | 0.796052 | 0.001121 | 0.319631 | 0.226649 | 0.404464 | 0.227316 |
| 1 | 50 | 0.530950 | 0.022567 | 0.853675 | 0.002139 | 0.538596 | 0.004606 | 0.571050 | 0.003446 |
| 2 | 100 | 0.551608 | 0.002142 | 0.858991 | 0.001087 | 0.556184 | 0.003797 | 0.589372 | 0.001936 |
| 3 | 150 | 0.555847 | 0.001970 | 0.858444 | 0.000873 | 0.564566 | 0.003625 | 0.599312 | 0.001440 |
| 4 | 200 | 0.557107 | 0.001943 | 0.858230 | 0.001119 | 0.568327 | 0.004219 | 0.607855 | 0.001478 |
| 5 | 250 | 0.558915 | 0.004403 | 0.858930 | 0.001620 | 0.570691 | 0.006173 | 0.614357 | 0.002168 |
| 6 | 300 | 0.559137 | 0.005160 | 0.859215 | 0.001203 | 0.571085 | 0.008967 | 0.620196 | 0.000763 |
| 7 | 350 | 0.558567 | 0.004719 | 0.859296 | 0.001373 | 0.571085 | 0.008290 | 0.625139 | 0.002134 |
| 8 | 400 | 0.557470 | 0.006132 | 0.859406 | 0.001258 | 0.571085 | 0.010423 | 0.629169 | 0.001185 |
| 9 | 450 | 0.556743 | 0.005637 | 0.859513 | 0.001224 | 0.569151 | 0.010120 | 0.632858 | 0.000930 |
| 10 | 500 | 0.556882 | 0.004991 | 0.859542 | 0.000937 | 0.569008 | 0.007942 | 0.636888 | 0.001632 |
| 11 | 550 | 0.557385 | 0.004375 | 0.859708 | 0.000983 | 0.571300 | 0.008869 | 0.640595 | 0.002290 |
| 12 | 600 | 0.557556 | 0.004837 | 0.859628 | 0.001092 | 0.571014 | 0.008739 | 0.643748 | 0.002718 |
| 13 | 650 | 0.558013 | 0.005858 | 0.859724 | 0.001094 | 0.572232 | 0.009858 | 0.647312 | 0.002747 |
| 14 | 700 | 0.557857 | 0.006989 | 0.859636 | 0.001167 | 0.572303 | 0.010954 | 0.649783 | 0.003198 |
| 15 | 750 | 0.557549 | 0.006076 | 0.859405 | 0.000902 | 0.571838 | 0.008846 | 0.652452 | 0.002758 |
| 16 | 799 | 0.558028 | 0.005928 | 0.859361 | 0.000877 | 0.573127 | 0.009003 | 0.655228 | 0.002355 |
# fit_cbr_model("Logloss")  ## ERROR: the stratified CV split is incompatible with groupwise metrics — rerun as plain classification (without group_ids) instead
# model = CatBoost(parameters)
# # model = CatBoost(hyper_params)
# grid_search_result = model.grid_search(hyper_params, train_pool, verbose=0)
# print(grid_search_result)
# print(grid_search_result['params'])
# {'border_count': 32, 'depth': 6, 'l2_leaf_reg': 3}
%%time
### Normal classifier (not ranking) :
model = CatBoostClassifier(verbose=False,iterations=max_epochs, eval_metric='AUC', task_type="GPU",
metric_period=30,
ignored_features=["utrip_id"])
model.fit(train_pool, eval_set=test_pool, plot=True,silent=False,use_best_model=True,
)
print(model.get_best_score())
print()
top4_acc = top4_accuracy(model)
# bestTest = 0.9061549306
# bestIteration = 799
# {'learn': {'Logloss': 0.04975058936150179, 'AUC': 0.9652304649353027}, 'validation': {'Logloss': 0.06744807045836551, 'AUC': 0.9061549305915833}}
# Top 4 Accuracy: 64.6231
# CPU times: user 2min 19s, sys: 40.7 s, total: 3min
# Wall time: 2min 17s
Learning rate set to 0.042912
0: learn: 0.6111331 test: 0.5127393 best: 0.5127393 (0) total: 236ms remaining: 3m 8s
30: learn: 0.6614749 test: 0.5729508 best: 0.5729508 (30) total: 7.43s remaining: 3m 4s
60: learn: 0.8603909 test: 0.7937304 best: 0.7937304 (60) total: 14.7s remaining: 2m 57s
90: learn: 0.9202251 test: 0.8601156 best: 0.8601156 (90) total: 22.2s remaining: 2m 53s
120: learn: 0.9359181 test: 0.8807634 best: 0.8807634 (120) total: 30.1s remaining: 2m 49s
150: learn: 0.9462079 test: 0.8847522 best: 0.8847522 (150) total: 37.9s remaining: 2m 42s
180: learn: 0.9513701 test: 0.8820381 best: 0.8847522 (150) total: 45.9s remaining: 2m 37s
210: learn: 0.9541184 test: 0.8832067 best: 0.8847522 (150) total: 54.3s remaining: 2m 31s
240: learn: 0.9562531 test: 0.8861905 best: 0.8861905 (240) total: 1m 2s remaining: 2m 25s
270: learn: 0.9575975 test: 0.8886899 best: 0.8886899 (270) total: 1m 11s remaining: 2m 18s
300: learn: 0.9587052 test: 0.8894615 best: 0.8894615 (300) total: 1m 19s remaining: 2m 12s
330: learn: 0.9594169 test: 0.8905969 best: 0.8905969 (330) total: 1m 28s remaining: 2m 4s
360: learn: 0.9601082 test: 0.8902917 best: 0.8905969 (330) total: 1m 36s remaining: 1m 57s
390: learn: 0.9606660 test: 0.8917980 best: 0.8917980 (390) total: 1m 44s remaining: 1m 49s
420: learn: 0.9612854 test: 0.8932259 best: 0.8932259 (420) total: 1m 52s remaining: 1m 41s
450: learn: 0.9617792 test: 0.8942041 best: 0.8942041 (450) total: 2m 1s remaining: 1m 33s
480: learn: 0.9621140 test: 0.8951214 best: 0.8951214 (480) total: 2m 9s remaining: 1m 26s
510: learn: 0.9624584 test: 0.8961748 best: 0.8961748 (510) total: 2m 17s remaining: 1m 17s
540: learn: 0.9628856 test: 0.8974140 best: 0.8974140 (540) total: 2m 26s remaining: 1m 10s
570: learn: 0.9632958 test: 0.8980094 best: 0.8980094 (570) total: 2m 34s remaining: 1m 2s
600: learn: 0.9635974 test: 0.8985456 best: 0.8985456 (600) total: 2m 43s remaining: 54s
630: learn: 0.9638964 test: 0.8991992 best: 0.8991992 (630) total: 2m 51s remaining: 45.9s
660: learn: 0.9641797 test: 0.9000098 best: 0.9000098 (660) total: 2m 59s remaining: 37.8s
690: learn: 0.9644498 test: 0.9007351 best: 0.9007351 (690) total: 3m 7s remaining: 29.6s
720: learn: 0.9647033 test: 0.9010640 best: 0.9010640 (720) total: 3m 15s remaining: 21.5s
750: learn: 0.9648905 test: 0.9015392 best: 0.9015392 (750) total: 3m 24s remaining: 13.3s
780: learn: 0.9651009 test: 0.9019842 best: 0.9019842 (780) total: 3m 32s remaining: 5.17s
799: learn: 0.9652330 test: 0.9021776 best: 0.9021776 (799) total: 3m 37s remaining: 0us
bestTest = 0.9021776319
bestIteration = 799
{'learn': {'Logloss': 0.04978118537511639, 'AUC': 0.9652329683303833}, 'validation': {'Logloss': 0.06803899913600006, 'AUC': 0.902177631855011}}
Top 4 Accuracy: 64.2642
CPU times: user 3min 27s, sys: 1min 30s, total: 4min 57s
Wall time: 4min 10s
# if FASTRUN:
# iters = 100
# parameters["iterations"] = 200
# else:
# iters = 3000
# # model = CatBoostClassifier(verbose=False,iterations=iters, eval_metric='AUC', task_type="GPU",
# # metric_period=40,
# # ignored_features=["utrip_id"])#,learning_rate=0.1,
# # model = CatBoostClassifier(verbose=False,iterations=iters, eval_metric='AUC:type=Ranking',# task_type="GPU",
# # metric_period=40,
# # loss_function="PairLogitPairwise", ##"PairLogitPairwise" "YetiRank" "PairLogitPairwise"
# # ignored_features=["utrip_id"])
# model.fit(train_pool, eval_set=test_pool, plot=True,silent=False,use_best_model=True,
# # save_snapshot=True
# )
# print(model.get_best_score())
# print()
# top4_acc = top4_accuracy(model)
1.2k epochs, default settings, 35% of ranked data:
* {'learn': {'Logloss': 0.0585, 'AUC': 0.959}
* 'validation': {'Logloss': 0.07554, 'AUC': 0.9168}}
%%time
explainer = shap.TreeExplainer(model)
sample_pool = Pool(data=X_test.head(100123),label = y_test.head(100123),cat_features=CAT_FEAT_NAMES)
shap_values = explainer.shap_values(test_pool)
# summarize the effects of all the features over all the data
shap.summary_plot(shap_values, X_test.head(100123))
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-745caa3ef906> in <module>() ----> 1 get_ipython().run_cell_magic('time', '', 'explainer = shap.TreeExplainer(model)\nsample_pool = Pool(data=X_test.head(100123),label = y_test.head(100123),cat_features=CAT_FEAT_NAMES)\nshap_values = explainer.shap_values(test_pool)\n# summarize the effects of all the features over all the data\nshap.summary_plot(shap_values, X_test.head(100123))') /usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py in run_cell_magic(self, magic_name, line, cell) 2115 magic_arg_s = self.var_expand(line, stack_depth) 2116 with self.builtin_trap: -> 2117 result = fn(magic_arg_s, cell) 2118 return result 2119 <decorator-gen-60> in time(self, line, cell, local_ns) /usr/local/lib/python3.6/dist-packages/IPython/core/magic.py in <lambda>(f, *a, **k) 186 # but it's overkill for just that one bit of state. 187 def magic_deco(arg): --> 188 call = lambda f, *a, **k: f(*a, **k) 189 190 if callable(arg): /usr/local/lib/python3.6/dist-packages/IPython/core/magics/execution.py in time(self, line, cell, local_ns) 1191 else: 1192 st = clock2() -> 1193 exec(code, glob, local_ns) 1194 end = clock2() 1195 out = None <timed exec> in <module>() NameError: name 'shap' is not defined